{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from splink.duckdb.duckdb_linker import DuckDBLinker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uncorrupted_record</th>\n",
       "      <th>cluster</th>\n",
       "      <th>full_name</th>\n",
       "      <th>dob</th>\n",
       "      <th>birth_place</th>\n",
       "      <th>postcode_fake</th>\n",
       "      <th>lat</th>\n",
       "      <th>lng</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>unique_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>True</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas clifford, 1st baron clifford of chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>Devon</td>\n",
       "      <td>TQ13 8DF</td>\n",
       "      <td>50.692449</td>\n",
       "      <td>-3.813964</td>\n",
       "      <td>male</td>\n",
       "      <td>politician</td>\n",
       "      <td>Q2296770-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>False</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas of chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>Devon</td>\n",
       "      <td>TQ13 8DF</td>\n",
       "      <td>50.692449</td>\n",
       "      <td>-3.813964</td>\n",
       "      <td>male</td>\n",
       "      <td>politician</td>\n",
       "      <td>Q2296770-2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>tom 1st baron clifford of chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>Devon</td>\n",
       "      <td>TQ13 8DF</td>\n",
       "      <td>50.692449</td>\n",
       "      <td>-3.813964</td>\n",
       "      <td>male</td>\n",
       "      <td>politician</td>\n",
       "      <td>Q2296770-3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas 1st chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>Devon</td>\n",
       "      <td>TQ13 8HU</td>\n",
       "      <td>50.687638</td>\n",
       "      <td>-3.895877</td>\n",
       "      <td>None</td>\n",
       "      <td>politician</td>\n",
       "      <td>Q2296770-4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas clifford, 1st baron chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>Devon</td>\n",
       "      <td>TQ13 8DF</td>\n",
       "      <td>50.692449</td>\n",
       "      <td>-3.813964</td>\n",
       "      <td>None</td>\n",
       "      <td>politician</td>\n",
       "      <td>Q2296770-5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   uncorrupted_record   cluster  \\\n",
       "0                True  Q2296770   \n",
       "1               False  Q2296770   \n",
       "2               False  Q2296770   \n",
       "3               False  Q2296770   \n",
       "4               False  Q2296770   \n",
       "\n",
       "                                          full_name         dob birth_place  \\\n",
       "0  thomas clifford, 1st baron clifford of chudleigh  1630-08-01       Devon   \n",
       "1                               thomas of chudleigh  1630-08-01       Devon   \n",
       "2               tom 1st baron clifford of chudleigh  1630-08-01       Devon   \n",
       "3                              thomas 1st chudleigh  1630-08-01       Devon   \n",
       "4              thomas clifford, 1st baron chudleigh  1630-08-01       Devon   \n",
       "\n",
       "  postcode_fake        lat       lng gender  occupation   unique_id  \n",
       "0      TQ13 8DF  50.692449 -3.813964   male  politician  Q2296770-1  \n",
       "1      TQ13 8DF  50.692449 -3.813964   male  politician  Q2296770-2  \n",
       "2      TQ13 8DF  50.692449 -3.813964   male  politician  Q2296770-3  \n",
       "3      TQ13 8HU  50.687638 -3.895877   None  politician  Q2296770-4  \n",
       "4      TQ13 8DF  50.692449 -3.813964   None  politician  Q2296770-5  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd \n",
    "# Allow up to 1000 rows to be shown when a frame is displayed.\n",
    "pd.set_option(\"display.max_rows\", 1000)\n",
    "\n",
    "# Read the raw records from the local parquet file and preview the first five.\n",
    "df = pd.read_parquet(\"./data/historical_figures_with_errors_50k.parquet\")\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>unique_id</th>\n",
       "      <th>cluster</th>\n",
       "      <th>full_name</th>\n",
       "      <th>first_and_surname</th>\n",
       "      <th>first_name</th>\n",
       "      <th>surname</th>\n",
       "      <th>dob</th>\n",
       "      <th>birth_place</th>\n",
       "      <th>postcode_fake</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Q2296770-1</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas clifford, 1st baron clifford of chudleigh</td>\n",
       "      <td>thomas chudleigh</td>\n",
       "      <td>thomas</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>devon</td>\n",
       "      <td>tq13 8df</td>\n",
       "      <td>male</td>\n",
       "      <td>politician</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Q2296770-2</td>\n",
       "      <td>Q2296770</td>\n",
       "      <td>thomas of chudleigh</td>\n",
       "      <td>thomas chudleigh</td>\n",
       "      <td>thomas</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>1630-08-01</td>\n",
       "      <td>devon</td>\n",
       "      <td>tq13 8df</td>\n",
       "      <td>male</td>\n",
       "      <td>politician</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    unique_id   cluster                                         full_name  \\\n",
       "0  Q2296770-1  Q2296770  thomas clifford, 1st baron clifford of chudleigh   \n",
       "1  Q2296770-2  Q2296770                               thomas of chudleigh   \n",
       "\n",
       "  first_and_surname first_name    surname         dob birth_place  \\\n",
       "0  thomas chudleigh     thomas  chudleigh  1630-08-01       devon   \n",
       "1  thomas chudleigh     thomas  chudleigh  1630-08-01       devon   \n",
       "\n",
       "  postcode_fake gender  occupation  \n",
       "0      tq13 8df   male  politician  \n",
       "1      tq13 8df   male  politician  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "def clean_df(df):\n",
    "    \"\"\"Return a cleaned copy of ``df`` with derived name columns.\n",
    "\n",
    "    Only the identifier, ``full_name`` and attribute columns are kept.\n",
    "    ``first_name`` is the first space-separated token of the stripped\n",
    "    ``full_name``; ``surname`` is the last token, or empty when the name has\n",
    "    a single token; ``first_and_surname`` joins the two with a space.\n",
    "    Every text column is then lower-cased and stripped, and empty strings\n",
    "    are replaced by ``None``. The input frame itself is not modified.\n",
    "    \"\"\"\n",
    "    id_cols = [\"unique_id\", \"cluster\"]\n",
    "    derived_name_cols = [\"first_and_surname\", \"first_name\", \"surname\"]\n",
    "    attribute_cols = [\"dob\", \"birth_place\", \"postcode_fake\", \"gender\", \"occupation\"]\n",
    "\n",
    "    input_cols = id_cols + [\"full_name\"] + attribute_cols\n",
    "    text_cols = [\"full_name\"] + derived_name_cols + attribute_cols\n",
    "    output_cols = id_cols + [\"full_name\"] + derived_name_cols + attribute_cols\n",
    "\n",
    "    # Work on a copy so the caller's frame is left untouched.\n",
    "    cleaned = df[input_cols].copy()\n",
    "\n",
    "    # Tokenise the name on single spaces; first and last tokens become the\n",
    "    # first name and surname.\n",
    "    name_tokens = cleaned[\"full_name\"].str.strip().str.split(\" \")\n",
    "    has_several_tokens = name_tokens.str.len() > 1\n",
    "\n",
    "    cleaned[\"first_name\"] = name_tokens.str[0]\n",
    "    # A single-token name has no surname; the empty string becomes None below.\n",
    "    cleaned[\"surname\"] = np.where(has_several_tokens, name_tokens.str[-1], \"\")\n",
    "    cleaned[\"first_and_surname\"] = cleaned[\"first_name\"] + \" \" + cleaned[\"surname\"]\n",
    "\n",
    "    for text_col in text_cols:\n",
    "        normalised = cleaned[text_col].str.lower().str.strip()\n",
    "        cleaned[text_col] = normalised.replace({\"\": None})\n",
    "\n",
    "    return cleaned[output_cols]\n",
    "\n",
    "\n",
    "# Run the cleaning step on the raw frame and preview the first two rows.\n",
    "df_clean = clean_df(df)\n",
    "df_clean.head(n=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.vegalite.v4+json": {
       "$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
       "config": {
        "view": {
         "continuousHeight": 300,
         "continuousWidth": 400
        }
       },
       "vconcat": [
        {
         "hconcat": [
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.9449625015258789,
              "percentile_inc_nulls": 0.9450353980064392,
              "sum_tokens_in_value_count_group": 2780,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 2780
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.8907960653305054,
              "percentile_inc_nulls": 0.8909407258033752,
              "sum_tokens_in_value_count_group": 2736,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 2736
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.8621290326118469,
              "percentile_inc_nulls": 0.8623116612434387,
              "sum_tokens_in_value_count_group": 1448,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1448
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.8341153264045715,
              "percentile_inc_nulls": 0.8343350887298584,
              "sum_tokens_in_value_count_group": 1415,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1415
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.8082596063613892,
              "percentile_inc_nulls": 0.8085135817527771,
              "sum_tokens_in_value_count_group": 1306,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1306
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.7832155227661133,
              "percentile_inc_nulls": 0.7835026979446411,
              "sum_tokens_in_value_count_group": 1265,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1265
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.7582308650016785,
              "percentile_inc_nulls": 0.7585511207580566,
              "sum_tokens_in_value_count_group": 1262,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1262
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.7341569066047668,
              "percentile_inc_nulls": 0.7345091104507446,
              "sum_tokens_in_value_count_group": 1216,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1216
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.7161211967468262,
              "percentile_inc_nulls": 0.7164973020553589,
              "sum_tokens_in_value_count_group": 911,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 911
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.698620080947876,
              "percentile_inc_nulls": 0.6990193128585815,
              "sum_tokens_in_value_count_group": 884,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 884
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6826632022857666,
              "percentile_inc_nulls": 0.6830835342407227,
              "sum_tokens_in_value_count_group": 806,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 806
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6697155237197876,
              "percentile_inc_nulls": 0.670153021812439,
              "sum_tokens_in_value_count_group": 654,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 654
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6573221683502197,
              "percentile_inc_nulls": 0.6577761173248291,
              "sum_tokens_in_value_count_group": 626,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 626
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6471858024597168,
              "percentile_inc_nulls": 0.6476531028747559,
              "sum_tokens_in_value_count_group": 512,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 512
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6370691657066345,
              "percentile_inc_nulls": 0.6375499367713928,
              "sum_tokens_in_value_count_group": 511,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 511
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6269921064376831,
              "percentile_inc_nulls": 0.6274862289428711,
              "sum_tokens_in_value_count_group": 509,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 509
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6171329021453857,
              "percentile_inc_nulls": 0.6176400780677795,
              "sum_tokens_in_value_count_group": 498,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 498
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.6079269647598267,
              "percentile_inc_nulls": 0.6084463596343994,
              "sum_tokens_in_value_count_group": 465,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 465
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5987408757209778,
              "percentile_inc_nulls": 0.5992723703384399,
              "sum_tokens_in_value_count_group": 464,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 464
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5904456377029419,
              "percentile_inc_nulls": 0.5909881591796875,
              "sum_tokens_in_value_count_group": 419,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 419
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5831403136253357,
              "percentile_inc_nulls": 0.5836925506591797,
              "sum_tokens_in_value_count_group": 369,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 369
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5762308835983276,
              "percentile_inc_nulls": 0.5767922401428223,
              "sum_tokens_in_value_count_group": 349,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 349
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5698164701461792,
              "percentile_inc_nulls": 0.5703862905502319,
              "sum_tokens_in_value_count_group": 324,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 324
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5634812116622925,
              "percentile_inc_nulls": 0.5640594959259033,
              "sum_tokens_in_value_count_group": 320,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 320
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5573043823242188,
              "percentile_inc_nulls": 0.557890772819519,
              "sum_tokens_in_value_count_group": 312,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 312
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5513848066329956,
              "percentile_inc_nulls": 0.551979124546051,
              "sum_tokens_in_value_count_group": 299,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 299
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5457029342651367,
              "percentile_inc_nulls": 0.5463047027587891,
              "sum_tokens_in_value_count_group": 287,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 287
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5401991605758667,
              "percentile_inc_nulls": 0.5408082604408264,
              "sum_tokens_in_value_count_group": 278,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 278
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5348538160324097,
              "percentile_inc_nulls": 0.5354700088500977,
              "sum_tokens_in_value_count_group": 270,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 270
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5298252105712891,
              "percentile_inc_nulls": 0.5304480195045471,
              "sum_tokens_in_value_count_group": 254,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 254
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5250935554504395,
              "percentile_inc_nulls": 0.5257226228713989,
              "sum_tokens_in_value_count_group": 239,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 239
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5204806923866272,
              "percentile_inc_nulls": 0.5211158990859985,
              "sum_tokens_in_value_count_group": 233,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 233
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5164815187454224,
              "percentile_inc_nulls": 0.5171220302581787,
              "sum_tokens_in_value_count_group": 202,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 202
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.51287841796875,
              "percentile_inc_nulls": 0.5135236978530884,
              "sum_tokens_in_value_count_group": 182,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 182
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5092949867248535,
              "percentile_inc_nulls": 0.5099450349807739,
              "sum_tokens_in_value_count_group": 181,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 181
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.5058304071426392,
              "percentile_inc_nulls": 0.5064850449562073,
              "sum_tokens_in_value_count_group": 175,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 175
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.49898040294647217,
              "percentile_inc_nulls": 0.49964410066604614,
              "sum_tokens_in_value_count_group": 346,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 173
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.49563461542129517,
              "percentile_inc_nulls": 0.4963027238845825,
              "sum_tokens_in_value_count_group": 169,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 169
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4923086166381836,
              "percentile_inc_nulls": 0.4929811358451843,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 168
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4890221953392029,
              "percentile_inc_nulls": 0.4896990656852722,
              "sum_tokens_in_value_count_group": 166,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 166
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.48587435483932495,
              "percentile_inc_nulls": 0.48655539751052856,
              "sum_tokens_in_value_count_group": 159,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 159
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4827859401702881,
              "percentile_inc_nulls": 0.4834710955619812,
              "sum_tokens_in_value_count_group": 156,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 156
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.47664862871170044,
              "percentile_inc_nulls": 0.4773419499397278,
              "sum_tokens_in_value_count_group": 310,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 155
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4705509543418884,
              "percentile_inc_nulls": 0.47125232219696045,
              "sum_tokens_in_value_count_group": 308,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 154
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4675813317298889,
              "percentile_inc_nulls": 0.4682866334915161,
              "sum_tokens_in_value_count_group": 150,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 150
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.46471065282821655,
              "percentile_inc_nulls": 0.4654197692871094,
              "sum_tokens_in_value_count_group": 145,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 145
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.46195876598358154,
              "percentile_inc_nulls": 0.46267151832580566,
              "sum_tokens_in_value_count_group": 139,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 139
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.459286093711853,
              "percentile_inc_nulls": 0.4600023627281189,
              "sum_tokens_in_value_count_group": 135,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 135
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.45405954122543335,
              "percentile_inc_nulls": 0.45478272438049316,
              "sum_tokens_in_value_count_group": 264,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 132
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.451485812664032,
              "percentile_inc_nulls": 0.45221245288848877,
              "sum_tokens_in_value_count_group": 130,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 130
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4464572072029114,
              "percentile_inc_nulls": 0.44719046354293823,
              "sum_tokens_in_value_count_group": 254,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 127
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.44402211904525757,
              "percentile_inc_nulls": 0.4447585940361023,
              "sum_tokens_in_value_count_group": 123,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 123
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.44164639711380005,
              "percentile_inc_nulls": 0.44238603115081787,
              "sum_tokens_in_value_count_group": 120,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 120
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4393102526664734,
              "percentile_inc_nulls": 0.44005298614501953,
              "sum_tokens_in_value_count_group": 118,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 118
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4370335340499878,
              "percentile_inc_nulls": 0.4377792477607727,
              "sum_tokens_in_value_count_group": 115,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 115
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4347766041755676,
              "percentile_inc_nulls": 0.4355252981185913,
              "sum_tokens_in_value_count_group": 114,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 114
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4325592517852783,
              "percentile_inc_nulls": 0.43331092596054077,
              "sum_tokens_in_value_count_group": 112,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 112
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4304013252258301,
              "percentile_inc_nulls": 0.43115586042404175,
              "sum_tokens_in_value_count_group": 109,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 109
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4283621311187744,
              "percentile_inc_nulls": 0.42911940813064575,
              "sum_tokens_in_value_count_group": 103,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 103
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.42634278535842896,
              "percentile_inc_nulls": 0.4271026849746704,
              "sum_tokens_in_value_count_group": 102,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 102
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4223436713218689,
              "percentile_inc_nulls": 0.42310887575149536,
              "sum_tokens_in_value_count_group": 202,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 101
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4203639030456543,
              "percentile_inc_nulls": 0.4211317300796509,
              "sum_tokens_in_value_count_group": 100,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 100
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.418443500995636,
              "percentile_inc_nulls": 0.4192138910293579,
              "sum_tokens_in_value_count_group": 97,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 97
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4165429472923279,
              "percentile_inc_nulls": 0.41731584072113037,
              "sum_tokens_in_value_count_group": 96,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 96
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4146621823310852,
              "percentile_inc_nulls": 0.4154375195503235,
              "sum_tokens_in_value_count_group": 95,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 95
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4128011465072632,
              "percentile_inc_nulls": 0.4135790467262268,
              "sum_tokens_in_value_count_group": 94,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 94
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4055156111717224,
              "percentile_inc_nulls": 0.4063031077384949,
              "sum_tokens_in_value_count_group": 368,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 92
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.4001108407974243,
              "percentile_inc_nulls": 0.4009055495262146,
              "sum_tokens_in_value_count_group": 273,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 91
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.39832907915115356,
              "percentile_inc_nulls": 0.3991261124610901,
              "sum_tokens_in_value_count_group": 90,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 90
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3948447108268738,
              "percentile_inc_nulls": 0.395646333694458,
              "sum_tokens_in_value_count_group": 176,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 88
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.39139991998672485,
              "percentile_inc_nulls": 0.392206072807312,
              "sum_tokens_in_value_count_group": 174,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 87
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.38971710205078125,
              "percentile_inc_nulls": 0.3905255198478699,
              "sum_tokens_in_value_count_group": 85,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 85
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3864702582359314,
              "percentile_inc_nulls": 0.3872830271720886,
              "sum_tokens_in_value_count_group": 164,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 82
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3848666548728943,
              "percentile_inc_nulls": 0.38568150997161865,
              "sum_tokens_in_value_count_group": 81,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 81
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3816990256309509,
              "percentile_inc_nulls": 0.38251811265945435,
              "sum_tokens_in_value_count_group": 160,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 80
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3801349997520447,
              "percentile_inc_nulls": 0.38095617294311523,
              "sum_tokens_in_value_count_group": 79,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 79
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.37859082221984863,
              "percentile_inc_nulls": 0.3794139623641968,
              "sum_tokens_in_value_count_group": 78,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 78
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.37706637382507324,
              "percentile_inc_nulls": 0.3778916001319885,
              "sum_tokens_in_value_count_group": 77,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 77
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.37556177377700806,
              "percentile_inc_nulls": 0.3763889670372009,
              "sum_tokens_in_value_count_group": 76,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 76
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3740769624710083,
              "percentile_inc_nulls": 0.374906063079834,
              "sum_tokens_in_value_count_group": 75,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 75
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3726118803024292,
              "percentile_inc_nulls": 0.37344300746917725,
              "sum_tokens_in_value_count_group": 74,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 74
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3697214722633362,
              "percentile_inc_nulls": 0.3705563545227051,
              "sum_tokens_in_value_count_group": 146,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 73
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.36687058210372925,
              "percentile_inc_nulls": 0.36770927906036377,
              "sum_tokens_in_value_count_group": 144,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 72
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3654649257659912,
              "percentile_inc_nulls": 0.36630553007125854,
              "sum_tokens_in_value_count_group": 71,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 71
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3640791177749634,
              "percentile_inc_nulls": 0.364921510219574,
              "sum_tokens_in_value_count_group": 70,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 70
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.362713098526001,
              "percentile_inc_nulls": 0.36355727910995483,
              "sum_tokens_in_value_count_group": 69,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 69
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3586743474006653,
              "percentile_inc_nulls": 0.3595238924026489,
              "sum_tokens_in_value_count_group": 204,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 68
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.35734790563583374,
              "percentile_inc_nulls": 0.35819923877716064,
              "sum_tokens_in_value_count_group": 67,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 67
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.35606104135513306,
              "percentile_inc_nulls": 0.35691410303115845,
              "sum_tokens_in_value_count_group": 65,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 65
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3547940254211426,
              "percentile_inc_nulls": 0.3556486964225769,
              "sum_tokens_in_value_count_group": 64,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 64
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3522995114326477,
              "percentile_inc_nulls": 0.35315752029418945,
              "sum_tokens_in_value_count_group": 126,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 63
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3510720729827881,
              "percentile_inc_nulls": 0.35193169116973877,
              "sum_tokens_in_value_count_group": 62,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 62
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.34744906425476074,
              "percentile_inc_nulls": 0.34831351041793823,
              "sum_tokens_in_value_count_group": 183,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 61
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3427768349647522,
              "percentile_inc_nulls": 0.34364742040634155,
              "sum_tokens_in_value_count_group": 236,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 59
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.33933204412460327,
              "percentile_inc_nulls": 0.34020721912384033,
              "sum_tokens_in_value_count_group": 174,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 58
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3382035493850708,
              "percentile_inc_nulls": 0.33908021450042725,
              "sum_tokens_in_value_count_group": 57,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 57
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.33709490299224854,
              "percentile_inc_nulls": 0.33797305822372437,
              "sum_tokens_in_value_count_group": 56,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 56
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.33602583408355713,
              "percentile_inc_nulls": 0.3369053602218628,
              "sum_tokens_in_value_count_group": 54,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 54
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3328779935836792,
              "percentile_inc_nulls": 0.33376169204711914,
              "sum_tokens_in_value_count_group": 159,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 53
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3287600874900818,
              "percentile_inc_nulls": 0.32964926958084106,
              "sum_tokens_in_value_count_group": 208,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 52
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.32674068212509155,
              "percentile_inc_nulls": 0.3276325464248657,
              "sum_tokens_in_value_count_group": 102,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 51
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.32575082778930664,
              "percentile_inc_nulls": 0.32664400339126587,
              "sum_tokens_in_value_count_group": 50,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 50
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3238106369972229,
              "percentile_inc_nulls": 0.32470637559890747,
              "sum_tokens_in_value_count_group": 98,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 49
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.32095980644226074,
              "percentile_inc_nulls": 0.32185930013656616,
              "sum_tokens_in_value_count_group": 144,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 48
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3200293183326721,
              "percentile_inc_nulls": 0.3209300637245178,
              "sum_tokens_in_value_count_group": 47,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 47
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.31729722023010254,
              "percentile_inc_nulls": 0.31820160150527954,
              "sum_tokens_in_value_count_group": 138,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 46
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.31373363733291626,
              "percentile_inc_nulls": 0.3146427273750305,
              "sum_tokens_in_value_count_group": 180,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 45
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.30937814712524414,
              "percentile_inc_nulls": 0.3102930188179016,
              "sum_tokens_in_value_count_group": 220,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 44
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.3076755404472351,
              "percentile_inc_nulls": 0.30859267711639404,
              "sum_tokens_in_value_count_group": 86,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 43
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.30434954166412354,
              "percentile_inc_nulls": 0.30527108907699585,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 42
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.30353784561157227,
              "percentile_inc_nulls": 0.30446046590805054,
              "sum_tokens_in_value_count_group": 41,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 41
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2987864017486572,
              "percentile_inc_nulls": 0.2997152805328369,
              "sum_tokens_in_value_count_group": 240,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 40
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.29569798707962036,
              "percentile_inc_nulls": 0.2966309189796448,
              "sum_tokens_in_value_count_group": 156,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 39
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2934410572052002,
              "percentile_inc_nulls": 0.29437702894210815,
              "sum_tokens_in_value_count_group": 114,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 38
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2912434935569763,
              "percentile_inc_nulls": 0.29218238592147827,
              "sum_tokens_in_value_count_group": 111,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 37
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2891647219657898,
              "percentile_inc_nulls": 0.2901063561439514,
              "sum_tokens_in_value_count_group": 105,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 35
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.28579914569854736,
              "percentile_inc_nulls": 0.28674525022506714,
              "sum_tokens_in_value_count_group": 170,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 34
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.281879186630249,
              "percentile_inc_nulls": 0.28283047676086426,
              "sum_tokens_in_value_count_group": 198,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 33
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2799786329269409,
              "percentile_inc_nulls": 0.2809324264526367,
              "sum_tokens_in_value_count_group": 96,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 32
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2775236964225769,
              "percentile_inc_nulls": 0.27848076820373535,
              "sum_tokens_in_value_count_group": 124,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 31
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.27336621284484863,
              "percentile_inc_nulls": 0.2743287682533264,
              "sum_tokens_in_value_count_group": 210,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 30
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.27049553394317627,
              "percentile_inc_nulls": 0.2714619040489197,
              "sum_tokens_in_value_count_group": 145,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 29
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.26772385835647583,
              "percentile_inc_nulls": 0.2686939239501953,
              "sum_tokens_in_value_count_group": 140,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 28
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2639821171760559,
              "percentile_inc_nulls": 0.264957070350647,
              "sum_tokens_in_value_count_group": 189,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 27
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.26037895679473877,
              "percentile_inc_nulls": 0.26135867834091187,
              "sum_tokens_in_value_count_group": 182,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 26
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2579042315483093,
              "percentile_inc_nulls": 0.25888729095458984,
              "sum_tokens_in_value_count_group": 125,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 25
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.25410306453704834,
              "percentile_inc_nulls": 0.25509113073349,
              "sum_tokens_in_value_count_group": 192,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 24
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.25091564655303955,
              "percentile_inc_nulls": 0.25190794467926025,
              "sum_tokens_in_value_count_group": 161,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 23
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.24612462520599365,
              "percentile_inc_nulls": 0.24712324142456055,
              "sum_tokens_in_value_count_group": 242,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 22
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2411355972290039,
              "percentile_inc_nulls": 0.24214082956314087,
              "sum_tokens_in_value_count_group": 252,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 21
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.23638415336608887,
              "percentile_inc_nulls": 0.23739570379257202,
              "sum_tokens_in_value_count_group": 240,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 20
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.22998952865600586,
              "percentile_inc_nulls": 0.23100954294204712,
              "sum_tokens_in_value_count_group": 323,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 19
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2228623628616333,
              "percentile_inc_nulls": 0.22389179468154907,
              "sum_tokens_in_value_count_group": 360,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 18
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.2188236117362976,
              "percentile_inc_nulls": 0.21985840797424316,
              "sum_tokens_in_value_count_group": 204,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 17
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.20995426177978516,
              "percentile_inc_nulls": 0.21100085973739624,
              "sum_tokens_in_value_count_group": 448,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 16
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.20193618535995483,
              "percentile_inc_nulls": 0.20299339294433594,
              "sum_tokens_in_value_count_group": 405,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 15
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.194452702999115,
              "percentile_inc_nulls": 0.1955198049545288,
              "sum_tokens_in_value_count_group": 378,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 14
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.18158423900604248,
              "percentile_inc_nulls": 0.18266832828521729,
              "sum_tokens_in_value_count_group": 650,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 13
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.17279404401779175,
              "percentile_inc_nulls": 0.17388981580734253,
              "sum_tokens_in_value_count_group": 444,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 12
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.16538971662521362,
              "percentile_inc_nulls": 0.16649532318115234,
              "sum_tokens_in_value_count_group": 374,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 11
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.15588682889938354,
              "percentile_inc_nulls": 0.15700501203536987,
              "sum_tokens_in_value_count_group": 480,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 10
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.14394885301589966,
              "percentile_inc_nulls": 0.14508283138275146,
              "sum_tokens_in_value_count_group": 603,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 9
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.13143670558929443,
              "percentile_inc_nulls": 0.13258731365203857,
              "sum_tokens_in_value_count_group": 632,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 8
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.12145870923995972,
              "percentile_inc_nulls": 0.12262248992919922,
              "sum_tokens_in_value_count_group": 504,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 7
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.11076796054840088,
              "percentile_inc_nulls": 0.11194592714309692,
              "sum_tokens_in_value_count_group": 540,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 6
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.10017621517181396,
              "percentile_inc_nulls": 0.10136818885803223,
              "sum_tokens_in_value_count_group": 535,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 5
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.08655542135238647,
              "percentile_inc_nulls": 0.08776545524597168,
              "sum_tokens_in_value_count_group": 688,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 4
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.0702817440032959,
              "percentile_inc_nulls": 0.07151329517364502,
              "sum_tokens_in_value_count_group": 822,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 3
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0.047276854515075684,
              "percentile_inc_nulls": 0.04853886365890503,
              "sum_tokens_in_value_count_group": 1162,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 2
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 0,
              "percentile_inc_nulls": 0.0013247132301330566,
              "sum_tokens_in_value_count_group": 2388,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 1
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "percentile_ex_nulls": 1,
              "percentile_inc_nulls": 1,
              "sum_tokens_in_value_count_group": 2780,
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value_count": 2780
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "percentile_ex_nulls",
              "type": "quantitative"
             },
             {
              "field": "percentile_inc_nulls",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "percentile_ex_nulls",
             "sort": "descending",
             "title": "Percentile",
             "type": "quantitative"
            },
            "y": {
             "field": "value_count",
             "title": "Count of values",
             "type": "quantitative"
            }
           },
           "mark": {
            "interpolate": "step-after",
            "type": "line"
           },
           "title": {
            "subtitle": "In this col, 67 values (0.1%) are null and there are 4413 distinct values",
            "text": "Distribution of counts of values in column first_name"
           }
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "william",
              "value_count": 2780
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "john",
              "value_count": 2736
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "thomas",
              "value_count": 1448
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "george",
              "value_count": 1415
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "henry",
              "value_count": 1306
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "james",
              "value_count": 1265
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "sir",
              "value_count": 1262
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "charles",
              "value_count": 1216
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "edward",
              "value_count": 911
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "robert",
              "value_count": 884
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Top 10 values by value count"
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "clifford,",
              "value_count": 1
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "feank",
              "value_count": 1
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "jerald",
              "value_count": 1
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "watts",
              "value_count": 1
             },
             {
              "distinct_value_count": 4413,
              "group_name": "first_name",
              "total_non_null_rows": 50511,
              "total_rows_inc_nulls": 50578,
              "value": "pzul",
              "value_count": 1
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "scale": {
              "domain": [
               0,
               2780
              ]
             },
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Bottom 5 values by value count"
          }
         ]
        },
        {
         "hconcat": [
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9991315603256226,
              "percentile_inc_nulls": 0.9993277788162231,
              "sum_tokens_in_value_count_group": 34,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 34
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9955557584762573,
              "percentile_inc_nulls": 0.9965597987174988,
              "sum_tokens_in_value_count_group": 140,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9941765666007996,
              "percentile_inc_nulls": 0.9954921007156372,
              "sum_tokens_in_value_count_group": 54,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 27
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9935380220413208,
              "percentile_inc_nulls": 0.9949977993965149,
              "sum_tokens_in_value_count_group": 25,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 25
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9929249882698059,
              "percentile_inc_nulls": 0.994523286819458,
              "sum_tokens_in_value_count_group": 24,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 24
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9923630952835083,
              "percentile_inc_nulls": 0.994088351726532,
              "sum_tokens_in_value_count_group": 22,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 22
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9912903308868408,
              "percentile_inc_nulls": 0.9932579398155212,
              "sum_tokens_in_value_count_group": 42,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 21
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9902687072753906,
              "percentile_inc_nulls": 0.9924671053886414,
              "sum_tokens_in_value_count_group": 40,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 20
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9892981052398682,
              "percentile_inc_nulls": 0.9917157888412476,
              "sum_tokens_in_value_count_group": 38,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 19
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9856201410293579,
              "percentile_inc_nulls": 0.9888686537742615,
              "sum_tokens_in_value_count_group": 144,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 18
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9830149412155151,
              "percentile_inc_nulls": 0.9868519902229309,
              "sum_tokens_in_value_count_group": 102,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 17
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9772936105728149,
              "percentile_inc_nulls": 0.9824231863021851,
              "sum_tokens_in_value_count_group": 224,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 16
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9642674922943115,
              "percentile_inc_nulls": 0.9723397493362427,
              "sum_tokens_in_value_count_group": 510,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 15
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9413822889328003,
              "percentile_inc_nulls": 0.9546245336532593,
              "sum_tokens_in_value_count_group": 896,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 14
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.9061861634254456,
              "percentile_inc_nulls": 0.9273794889450073,
              "sum_tokens_in_value_count_group": 1378,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 13
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.8565335273742676,
              "percentile_inc_nulls": 0.8889437913894653,
              "sum_tokens_in_value_count_group": 1944,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 12
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.7893849611282349,
              "percentile_inc_nulls": 0.8369646668434143,
              "sum_tokens_in_value_count_group": 2629,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 11
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.7117388248443604,
              "percentile_inc_nulls": 0.7768595218658447,
              "sum_tokens_in_value_count_group": 3040,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 10
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.6372599005699158,
              "percentile_inc_nulls": 0.7192059755325317,
              "sum_tokens_in_value_count_group": 2916,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 9
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.5657438039779663,
              "percentile_inc_nulls": 0.6638458967208862,
              "sum_tokens_in_value_count_group": 2800,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 8
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.4999489188194275,
              "percentile_inc_nulls": 0.6129146814346313,
              "sum_tokens_in_value_count_group": 2576,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 7
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.4349713921546936,
              "percentile_inc_nulls": 0.5626161694526672,
              "sum_tokens_in_value_count_group": 2544,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 6
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.3731610178947449,
              "percentile_inc_nulls": 0.5147692561149597,
              "sum_tokens_in_value_count_group": 2420,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 5
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.3146199584007263,
              "percentile_inc_nulls": 0.4694530963897705,
              "sum_tokens_in_value_count_group": 2292,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 4
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.24496835470199585,
              "percentile_inc_nulls": 0.41553640365600586,
              "sum_tokens_in_value_count_group": 2727,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 3
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0.1625204086303711,
              "percentile_inc_nulls": 0.35171419382095337,
              "sum_tokens_in_value_count_group": 3228,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 2
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 0,
              "percentile_inc_nulls": 0.22590851783752441,
              "sum_tokens_in_value_count_group": 6363,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 1
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "percentile_ex_nulls": 1,
              "percentile_inc_nulls": 1,
              "sum_tokens_in_value_count_group": 34,
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value_count": 34
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "percentile_ex_nulls",
              "type": "quantitative"
             },
             {
              "field": "percentile_inc_nulls",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "percentile_ex_nulls",
             "sort": "descending",
             "title": "Percentile",
             "type": "quantitative"
            },
            "y": {
             "field": "value_count",
             "title": "Count of values",
             "type": "quantitative"
            }
           },
           "mark": {
            "interpolate": "step-after",
            "type": "line"
           },
           "title": {
            "subtitle": "In this col, 11,426 values (22.6%) are null and there are 12363 distinct values",
            "text": "Distribution of counts of values in column postcode_fake"
           }
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "se1 7sg",
              "value_count": 34
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "sw1p 3pl",
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "l3 0ah",
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "sw1a 2jh",
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "sw1h 9aa",
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "se1 8xz",
              "value_count": 28
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "pl1 3dq",
              "value_count": 27
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "sw1a 2bj",
              "value_count": 27
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "se1 7eh",
              "value_count": 25
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "s8 8ly",
              "value_count": 24
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Top 10 values by value count"
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "gl55 6js",
              "value_count": 1
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "b79 3tw",
              "value_count": 1
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "ch62 3ph",
              "value_count": 1
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "dn37 0pt",
              "value_count": 1
             },
             {
              "distinct_value_count": 12363,
              "group_name": "postcode_fake",
              "total_non_null_rows": 39152,
              "total_rows_inc_nulls": 50578,
              "value": "se1 9rq",
              "value_count": 1
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "scale": {
              "domain": [
               0,
               34
              ]
             },
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Bottom 5 values by value count"
          }
         ]
        },
        {
         "hconcat": [
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.9430503845214844,
              "percentile_inc_nulls": 0.9558899402618408,
              "sum_tokens_in_value_count_group": 2231,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 2231
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.8983024954795837,
              "percentile_inc_nulls": 0.9212305545806885,
              "sum_tokens_in_value_count_group": 1753,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1753
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.8536056280136108,
              "percentile_inc_nulls": 0.886610746383667,
              "sum_tokens_in_value_count_group": 1751,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1751
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.8123548030853271,
              "percentile_inc_nulls": 0.854660153388977,
              "sum_tokens_in_value_count_group": 1616,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1616
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.7715379595756531,
              "percentile_inc_nulls": 0.8230456113815308,
              "sum_tokens_in_value_count_group": 1599,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1599
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.7308487892150879,
              "percentile_inc_nulls": 0.7915298938751221,
              "sum_tokens_in_value_count_group": 1594,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1594
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.6960816979408264,
              "percentile_inc_nulls": 0.7646012306213379,
              "sum_tokens_in_value_count_group": 1362,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1362
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.662922739982605,
              "percentile_inc_nulls": 0.7389180660247803,
              "sum_tokens_in_value_count_group": 1299,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1299
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.6321889162063599,
              "percentile_inc_nulls": 0.7151132822036743,
              "sum_tokens_in_value_count_group": 1204,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1204
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.6024760603904724,
              "percentile_inc_nulls": 0.6920993328094482,
              "sum_tokens_in_value_count_group": 1164,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1164
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.5732482671737671,
              "percentile_inc_nulls": 0.6694610118865967,
              "sum_tokens_in_value_count_group": 1145,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1145
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.5462412238121033,
              "percentile_inc_nulls": 0.6485428810119629,
              "sum_tokens_in_value_count_group": 1058,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1058
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.5202552676200867,
              "percentile_inc_nulls": 0.6284155249595642,
              "sum_tokens_in_value_count_group": 1018,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1018
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.500906229019165,
              "percentile_inc_nulls": 0.6134287714958191,
              "sum_tokens_in_value_count_group": 758,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 758
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.48467135429382324,
              "percentile_inc_nulls": 0.6008541584014893,
              "sum_tokens_in_value_count_group": 636,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 636
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4693554639816284,
              "percentile_inc_nulls": 0.5889912843704224,
              "sum_tokens_in_value_count_group": 600,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 600
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4556732773780823,
              "percentile_inc_nulls": 0.5783937573432922,
              "sum_tokens_in_value_count_group": 536,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 536
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4441097378730774,
              "percentile_inc_nulls": 0.5694372653961182,
              "sum_tokens_in_value_count_group": 453,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 453
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4328015446662903,
              "percentile_inc_nulls": 0.5606785416603088,
              "sum_tokens_in_value_count_group": 443,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 443
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.42269307374954224,
              "percentile_inc_nulls": 0.5528490543365479,
              "sum_tokens_in_value_count_group": 396,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 396
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4129674434661865,
              "percentile_inc_nulls": 0.5453161597251892,
              "sum_tokens_in_value_count_group": 381,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 381
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.4039565920829773,
              "percentile_inc_nulls": 0.5383368134498596,
              "sum_tokens_in_value_count_group": 353,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 353
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.39514994621276855,
              "percentile_inc_nulls": 0.5315157175064087,
              "sum_tokens_in_value_count_group": 345,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 345
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.3872367739677429,
              "percentile_inc_nulls": 0.5253865718841553,
              "sum_tokens_in_value_count_group": 310,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 310
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.38008934259414673,
              "percentile_inc_nulls": 0.519850492477417,
              "sum_tokens_in_value_count_group": 280,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 280
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.3741927146911621,
              "percentile_inc_nulls": 0.5152833461761475,
              "sum_tokens_in_value_count_group": 231,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 231
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.36857688426971436,
              "percentile_inc_nulls": 0.5109336376190186,
              "sum_tokens_in_value_count_group": 220,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 220
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.36319082975387573,
              "percentile_inc_nulls": 0.5067617893218994,
              "sum_tokens_in_value_count_group": 211,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 211
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.35818761587142944,
              "percentile_inc_nulls": 0.5028866529464722,
              "sum_tokens_in_value_count_group": 196,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 196
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.3532865643501282,
              "percentile_inc_nulls": 0.4990904927253723,
              "sum_tokens_in_value_count_group": 192,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 192
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.34874284267425537,
              "percentile_inc_nulls": 0.49557119607925415,
              "sum_tokens_in_value_count_group": 178,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 178
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.34422463178634644,
              "percentile_inc_nulls": 0.49207162857055664,
              "sum_tokens_in_value_count_group": 177,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 177
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.33975750207901,
              "percentile_inc_nulls": 0.48861163854599,
              "sum_tokens_in_value_count_group": 175,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 175
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.33097636699676514,
              "percentile_inc_nulls": 0.4818102717399597,
              "sum_tokens_in_value_count_group": 344,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 172
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.32661134004592896,
              "percentile_inc_nulls": 0.47842937707901,
              "sum_tokens_in_value_count_group": 171,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 171
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.31823867559432983,
              "percentile_inc_nulls": 0.47194433212280273,
              "sum_tokens_in_value_count_group": 328,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 164
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.314103364944458,
              "percentile_inc_nulls": 0.46874135732650757,
              "sum_tokens_in_value_count_group": 162,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 162
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.31004464626312256,
              "percentile_inc_nulls": 0.4655976891517639,
              "sum_tokens_in_value_count_group": 159,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 159
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.3060370087623596,
              "percentile_inc_nulls": 0.4624935984611511,
              "sum_tokens_in_value_count_group": 157,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 157
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.3020803928375244,
              "percentile_inc_nulls": 0.4594290256500244,
              "sum_tokens_in_value_count_group": 155,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 155
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.29827696084976196,
              "percentile_inc_nulls": 0.45648306608200073,
              "sum_tokens_in_value_count_group": 149,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 149
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2944990396499634,
              "percentile_inc_nulls": 0.4535568952560425,
              "sum_tokens_in_value_count_group": 148,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 148
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.29074662923812866,
              "percentile_inc_nulls": 0.4506504535675049,
              "sum_tokens_in_value_count_group": 147,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 147
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.28704530000686646,
              "percentile_inc_nulls": 0.4477836489677429,
              "sum_tokens_in_value_count_group": 145,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 145
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2834460735321045,
              "percentile_inc_nulls": 0.44499582052230835,
              "sum_tokens_in_value_count_group": 141,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 141
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2799489498138428,
              "percentile_inc_nulls": 0.4422871470451355,
              "sum_tokens_in_value_count_group": 137,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 137
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2764773368835449,
              "percentile_inc_nulls": 0.4395982623100281,
              "sum_tokens_in_value_count_group": 136,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 136
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2731078267097473,
              "percentile_inc_nulls": 0.4369884133338928,
              "sum_tokens_in_value_count_group": 132,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 132
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2664709687232971,
              "percentile_inc_nulls": 0.43184781074523926,
              "sum_tokens_in_value_count_group": 260,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 130
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2631780505180359,
              "percentile_inc_nulls": 0.4292973279953003,
              "sum_tokens_in_value_count_group": 129,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 129
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.25993621349334717,
              "percentile_inc_nulls": 0.4267863631248474,
              "sum_tokens_in_value_count_group": 127,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 127
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.256924033164978,
              "percentile_inc_nulls": 0.4244533181190491,
              "sum_tokens_in_value_count_group": 118,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 118
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.25398850440979004,
              "percentile_inc_nulls": 0.42217957973480225,
              "sum_tokens_in_value_count_group": 115,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 115
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2512826919555664,
              "percentile_inc_nulls": 0.42008382081985474,
              "sum_tokens_in_value_count_group": 106,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 106
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.24862796068191528,
              "percentile_inc_nulls": 0.4180275797843933,
              "sum_tokens_in_value_count_group": 104,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 104
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.24599874019622803,
              "percentile_inc_nulls": 0.4159911274909973,
              "sum_tokens_in_value_count_group": 103,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 103
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.24339503049850464,
              "percentile_inc_nulls": 0.41397446393966675,
              "sum_tokens_in_value_count_group": 102,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 102
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.23308229446411133,
              "percentile_inc_nulls": 0.4059867858886719,
              "sum_tokens_in_value_count_group": 404,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 101
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.23058074712753296,
              "percentile_inc_nulls": 0.40404921770095825,
              "sum_tokens_in_value_count_group": 98,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 98
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.22810465097427368,
              "percentile_inc_nulls": 0.4021313786506653,
              "sum_tokens_in_value_count_group": 97,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 97
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.22567963600158691,
              "percentile_inc_nulls": 0.4002530574798584,
              "sum_tokens_in_value_count_group": 95,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 95
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2208806872367859,
              "percentile_inc_nulls": 0.39653605222702026,
              "sum_tokens_in_value_count_group": 188,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 94
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.21850669384002686,
              "percentile_inc_nulls": 0.39469730854034424,
              "sum_tokens_in_value_count_group": 93,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 93
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.21618378162384033,
              "percentile_inc_nulls": 0.3928980827331543,
              "sum_tokens_in_value_count_group": 91,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 91
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.21393746137619019,
              "percentile_inc_nulls": 0.39115822315216064,
              "sum_tokens_in_value_count_group": 88,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 88
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2117166519165039,
              "percentile_inc_nulls": 0.38943809270858765,
              "sum_tokens_in_value_count_group": 87,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 87
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2095213532447815,
              "percentile_inc_nulls": 0.3877377510070801,
              "sum_tokens_in_value_count_group": 86,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 86
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2052329182624817,
              "percentile_inc_nulls": 0.3844161629676819,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 84
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.2032163143157959,
              "percentile_inc_nulls": 0.3828542232513428,
              "sum_tokens_in_value_count_group": 79,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 79
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.20122528076171875,
              "percentile_inc_nulls": 0.3813120126724243,
              "sum_tokens_in_value_count_group": 78,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 78
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.19933629035949707,
              "percentile_inc_nulls": 0.3798489570617676,
              "sum_tokens_in_value_count_group": 74,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 74
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1974983811378479,
              "percentile_inc_nulls": 0.3784254193305969,
              "sum_tokens_in_value_count_group": 72,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 72
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.19568604230880737,
              "percentile_inc_nulls": 0.3770216107368469,
              "sum_tokens_in_value_count_group": 71,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 71
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.19211232662200928,
              "percentile_inc_nulls": 0.37425363063812256,
              "sum_tokens_in_value_count_group": 140,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 70
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.18858963251113892,
              "percentile_inc_nulls": 0.3715251684188843,
              "sum_tokens_in_value_count_group": 138,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 69
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.18685388565063477,
              "percentile_inc_nulls": 0.37018072605133057,
              "sum_tokens_in_value_count_group": 68,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 68
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.18343329429626465,
              "percentile_inc_nulls": 0.36753135919570923,
              "sum_tokens_in_value_count_group": 134,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 67
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.18174856901168823,
              "percentile_inc_nulls": 0.3662264347076416,
              "sum_tokens_in_value_count_group": 66,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 66
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.17541801929473877,
              "percentile_inc_nulls": 0.36132311820983887,
              "sum_tokens_in_value_count_group": 248,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 62
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.17386090755462646,
              "percentile_inc_nulls": 0.36011701822280884,
              "sum_tokens_in_value_count_group": 61,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 61
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.17232930660247803,
              "percentile_inc_nulls": 0.358930766582489,
              "sum_tokens_in_value_count_group": 60,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 60
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.16788768768310547,
              "percentile_inc_nulls": 0.355490505695343,
              "sum_tokens_in_value_count_group": 174,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 58
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.16497766971588135,
              "percentile_inc_nulls": 0.3532365560531616,
              "sum_tokens_in_value_count_group": 114,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 57
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.16216975450515747,
              "percentile_inc_nulls": 0.35106170177459717,
              "sum_tokens_in_value_count_group": 110,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 55
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1607913374900818,
              "percentile_inc_nulls": 0.34999406337738037,
              "sum_tokens_in_value_count_group": 54,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 54
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.15808552503585815,
              "percentile_inc_nulls": 0.34789830446243286,
              "sum_tokens_in_value_count_group": 106,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 53
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1567581295967102,
              "percentile_inc_nulls": 0.34687018394470215,
              "sum_tokens_in_value_count_group": 52,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 52
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1554563045501709,
              "percentile_inc_nulls": 0.34586185216903687,
              "sum_tokens_in_value_count_group": 51,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 51
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1516273021697998,
              "percentile_inc_nulls": 0.34289610385894775,
              "sum_tokens_in_value_count_group": 150,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 50
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.14912569522857666,
              "percentile_inc_nulls": 0.34095853567123413,
              "sum_tokens_in_value_count_group": 98,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 49
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.14667516946792603,
              "percentile_inc_nulls": 0.3390604853630066,
              "sum_tokens_in_value_count_group": 96,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 48
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.14427566528320312,
              "percentile_inc_nulls": 0.33720195293426514,
              "sum_tokens_in_value_count_group": 94,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 47
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.14310145378112793,
              "percentile_inc_nulls": 0.33629244565963745,
              "sum_tokens_in_value_count_group": 46,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 46
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1419527530670166,
              "percentile_inc_nulls": 0.3354027271270752,
              "sum_tokens_in_value_count_group": 45,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 45
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.13858330249786377,
              "percentile_inc_nulls": 0.3327929377555847,
              "sum_tokens_in_value_count_group": 132,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 44
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.13638800382614136,
              "percentile_inc_nulls": 0.33109259605407715,
              "sum_tokens_in_value_count_group": 86,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 43
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.1331716775894165,
              "percentile_inc_nulls": 0.3286013603210449,
              "sum_tokens_in_value_count_group": 126,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 42
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.13107848167419434,
              "percentile_inc_nulls": 0.3269801139831543,
              "sum_tokens_in_value_count_group": 82,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 41
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.12903636693954468,
              "percentile_inc_nulls": 0.32539838552474976,
              "sum_tokens_in_value_count_group": 80,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 40
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.12704533338546753,
              "percentile_inc_nulls": 0.3238562345504761,
              "sum_tokens_in_value_count_group": 78,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 39
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.12510532140731812,
              "percentile_inc_nulls": 0.3223536014556885,
              "sum_tokens_in_value_count_group": 76,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 38
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.12038290500640869,
              "percentile_inc_nulls": 0.31869590282440186,
              "sum_tokens_in_value_count_group": 185,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 37
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.11946392059326172,
              "percentile_inc_nulls": 0.31798410415649414,
              "sum_tokens_in_value_count_group": 36,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 36
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.11499679088592529,
              "percentile_inc_nulls": 0.3145241141319275,
              "sum_tokens_in_value_count_group": 175,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 35
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.11152517795562744,
              "percentile_inc_nulls": 0.3118351697921753,
              "sum_tokens_in_value_count_group": 136,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 34
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.10899806022644043,
              "percentile_inc_nulls": 0.30987781286239624,
              "sum_tokens_in_value_count_group": 99,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 33
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.10409700870513916,
              "percentile_inc_nulls": 0.30608171224594116,
              "sum_tokens_in_value_count_group": 192,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 32
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.10093170404434204,
              "percentile_inc_nulls": 0.3036300539970398,
              "sum_tokens_in_value_count_group": 124,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 31
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.09786856174468994,
              "percentile_inc_nulls": 0.30125749111175537,
              "sum_tokens_in_value_count_group": 120,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 30
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.09268665313720703,
              "percentile_inc_nulls": 0.2972438335418701,
              "sum_tokens_in_value_count_group": 203,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 29
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.08982771635055542,
              "percentile_inc_nulls": 0.2950294613838196,
              "sum_tokens_in_value_count_group": 112,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 28
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.08638161420822144,
              "percentile_inc_nulls": 0.2923603057861328,
              "sum_tokens_in_value_count_group": 135,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 27
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.08306318521499634,
              "percentile_inc_nulls": 0.2897900342941284,
              "sum_tokens_in_value_count_group": 130,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 26
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.08051055669784546,
              "percentile_inc_nulls": 0.28781288862228394,
              "sum_tokens_in_value_count_group": 100,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 25
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.07622206211090088,
              "percentile_inc_nulls": 0.28449130058288574,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 24
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.07328653335571289,
              "percentile_inc_nulls": 0.2822175621986389,
              "sum_tokens_in_value_count_group": 115,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 23
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.06935542821884155,
              "percentile_inc_nulls": 0.27917277812957764,
              "sum_tokens_in_value_count_group": 154,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 22
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.06506699323654175,
              "percentile_inc_nulls": 0.27585119009017944,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 21
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.06047224998474121,
              "percentile_inc_nulls": 0.2722923159599304,
              "sum_tokens_in_value_count_group": 180,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 20
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.05465221405029297,
              "percentile_inc_nulls": 0.2677844166755676,
              "sum_tokens_in_value_count_group": 228,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 19
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.05097639560699463,
              "percentile_inc_nulls": 0.2649373412132263,
              "sum_tokens_in_value_count_group": 144,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 18
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.04533505439758301,
              "percentile_inc_nulls": 0.260567843914032,
              "sum_tokens_in_value_count_group": 221,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 17
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.04329293966293335,
              "percentile_inc_nulls": 0.25898611545562744,
              "sum_tokens_in_value_count_group": 80,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 16
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.03793233633041382,
              "percentile_inc_nulls": 0.2548341155052185,
              "sum_tokens_in_value_count_group": 210,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 15
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.033643901348114014,
              "percentile_inc_nulls": 0.2515125274658203,
              "sum_tokens_in_value_count_group": 168,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 14
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.029329955577850342,
              "percentile_inc_nulls": 0.2481711506843567,
              "sum_tokens_in_value_count_group": 169,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 13
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.02473515272140503,
              "percentile_inc_nulls": 0.24461227655410767,
              "sum_tokens_in_value_count_group": 180,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 12
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.020804107189178467,
              "percentile_inc_nulls": 0.2415674924850464,
              "sum_tokens_in_value_count_group": 154,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 11
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.01850670576095581,
              "percentile_inc_nulls": 0.23978805541992188,
              "sum_tokens_in_value_count_group": 90,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 10
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.015749812126159668,
              "percentile_inc_nulls": 0.2376527190208435,
              "sum_tokens_in_value_count_group": 108,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 9
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.01370769739151001,
              "percentile_inc_nulls": 0.23607099056243896,
              "sum_tokens_in_value_count_group": 80,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 8
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.010848760604858398,
              "percentile_inc_nulls": 0.23385661840438843,
              "sum_tokens_in_value_count_group": 112,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 7
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.007938742637634277,
              "percentile_inc_nulls": 0.23160266876220703,
              "sum_tokens_in_value_count_group": 114,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 6
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.0060242414474487305,
              "percentile_inc_nulls": 0.23011982440948486,
              "sum_tokens_in_value_count_group": 75,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 5
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.004084229469299316,
              "percentile_inc_nulls": 0.22861719131469727,
              "sum_tokens_in_value_count_group": 76,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 4
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.002552628517150879,
              "percentile_inc_nulls": 0.22743088006973267,
              "sum_tokens_in_value_count_group": 60,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 3
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0.0017358064651489258,
              "percentile_inc_nulls": 0.22679823637008667,
              "sum_tokens_in_value_count_group": 32,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 2
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 0,
              "percentile_inc_nulls": 0.22545373439788818,
              "sum_tokens_in_value_count_group": 68,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 1
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "percentile_ex_nulls": 1,
              "percentile_inc_nulls": 1,
              "sum_tokens_in_value_count_group": 2231,
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value_count": 2231
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "percentile_ex_nulls",
              "type": "quantitative"
             },
             {
              "field": "percentile_inc_nulls",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "percentile_ex_nulls",
             "sort": "descending",
             "title": "Percentile",
             "type": "quantitative"
            },
            "y": {
             "field": "value_count",
             "title": "Count of values",
             "type": "quantitative"
            }
           },
           "mark": {
            "interpolate": "step-after",
            "type": "line"
           },
           "title": {
            "subtitle": "In this col, 11,403 values (22.5%) are null and there are 537 distinct values",
            "text": "Distribution of counts of values in column substr(dob, 1,4)"
           }
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1862",
              "value_count": 2231
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1860",
              "value_count": 1753
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1861",
              "value_count": 1751
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1859",
              "value_count": 1616
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1857",
              "value_count": 1599
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1858",
              "value_count": 1594
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1856",
              "value_count": 1362
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1855",
              "value_count": 1299
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1851",
              "value_count": 1204
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1850",
              "value_count": 1164
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Top 10 values by value count"
          },
          {
           "data": {
            "values": [
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1092",
              "value_count": 1
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1085",
              "value_count": 1
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1338",
              "value_count": 1
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1262",
              "value_count": 1
             },
             {
              "distinct_value_count": 537,
              "group_name": "substr_dob_1_4_",
              "total_non_null_rows": 39175,
              "total_rows_inc_nulls": 50578,
              "value": "1295",
              "value_count": 1
             }
            ]
           },
           "encoding": {
            "tooltip": [
             {
              "field": "value",
              "type": "nominal"
             },
             {
              "field": "value_count",
              "type": "quantitative"
             },
             {
              "field": "total_non_null_rows",
              "type": "quantitative"
             },
             {
              "field": "total_rows_inc_nulls",
              "type": "quantitative"
             }
            ],
            "x": {
             "field": "value",
             "sort": "-y",
             "title": null,
             "type": "nominal"
            },
            "y": {
             "field": "value_count",
             "scale": {
              "domain": [
               0,
               2231
              ]
             },
             "title": "Value count",
             "type": "quantitative"
            }
           },
           "mark": "bar",
           "title": "Bottom 5 values by value count"
          }
         ]
        }
       ]
      },
      "image/png": "",
      "text/plain": [
       "<VegaLite 4 object>\n",
       "\n",
       "If you see this message, it means the renderer has not been properly enabled\n",
       "for the frontend that you are using. For more information, see\n",
       "https://altair-viz.github.io/user_guide/troubleshooting.html\n"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Initialise the linker, passing in the input dataset(s)\n",
    "linker = DuckDBLinker(df_clean, connection=\":temporary:\")\n",
    "\n",
    "import altair as alt\n",
    "alt.renderers.enable('mimetype')\n",
    "linker.profile_columns([\"first_name\", \"postcode_fake\", \"substr(dob, 1,4)\"], top_n=10, bottom_n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'count_of_pairwise_comparisons_generated': 16372982}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the candidate pairs produced when blocking on first name alone\n",
    "linker.compute_number_of_comparisons_generated_by_blocking_rule(\"l.first_name = r.first_name\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'count_of_pairwise_comparisons_generated': 243656}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the candidate pairs produced when blocking on first name and surname together\n",
    "linker.compute_number_of_comparisons_generated_by_blocking_rule(\n",
    "    \"l.first_name = r.first_name and l.surname = r.surname\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import splink.duckdb.duckdb_comparison_library as cl\n",
    "\n",
    "# Linkage model settings: blocking rules used to generate predictions,\n",
    "# one comparison per column, and the EM iteration cap / convergence threshold.\n",
    "settings = {\n",
    "    \"probability_two_random_records_match\": 9 / 50_000,\n",
    "    \"link_type\": \"dedupe_only\",\n",
    "    \"blocking_rules_to_generate_predictions\": [\n",
    "        \"l.first_name = r.first_name and l.surname = r.surname\",\n",
    "        \"l.surname = r.surname and l.dob = r.dob\",\n",
    "        \"l.first_name = r.first_name and l.dob = r.dob\",\n",
    "        \"l.postcode_fake = r.postcode_fake and l.first_name = r.first_name\",\n",
    "    ],\n",
    "    \"comparisons\": [\n",
    "        # Names: Jaccard similarity at thresholds 0.9 and 0.5\n",
    "        cl.jaccard_at_thresholds(\n",
    "            \"first_name\", [0.9, 0.5], term_frequency_adjustments=False\n",
    "        ),\n",
    "        cl.jaccard_at_thresholds(\n",
    "            \"surname\", [0.9, 0.5], term_frequency_adjustments=False\n",
    "        ),\n",
    "        # Date of birth and postcode: Levenshtein edit-distance thresholds\n",
    "        cl.levenshtein_at_thresholds(\n",
    "            \"dob\", [1, 2], term_frequency_adjustments=False\n",
    "        ),\n",
    "        cl.levenshtein_at_thresholds(\"postcode_fake\", 2),\n",
    "        # Categorical columns: exact match only\n",
    "        cl.exact_match(\"birth_place\", term_frequency_adjustments=False),\n",
    "        cl.exact_match(\"occupation\", term_frequency_adjustments=False),\n",
    "    ],\n",
    "    \"retain_matching_columns\": True,\n",
    "    \"retain_intermediate_calculation_columns\": True,\n",
    "    \"max_iterations\": 10,\n",
    "    \"em_convergence\": 0.01,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pass the settings dictionary defined above to the linker\n",
    "linker.initialise_settings(settings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "----- Estimating u probabilities using random sampling -----\n",
      "\n",
      "Estimated u probabilities using random sampling\n",
      "\n",
      "Your model is not yet fully trained. Missing estimates for:\n",
      "    - first_name (no m values are trained).\n",
      "    - surname (no m values are trained).\n",
      "    - dob (no m values are trained).\n",
      "    - postcode_fake (no m values are trained).\n",
      "    - birth_place (no m values are trained).\n",
      "    - occupation (no m values are trained).\n"
     ]
    }
   ],
   "source": [
    "# Estimate the u probabilities by random sampling; the m values remain untrained after this step\n",
    "linker.estimate_u_using_random_sampling(target_rows=5e6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "----- Starting EM training session -----\n",
      "\n",
      "Estimating the m probabilities of the model by blocking on:\n",
      "l.first_name = r.first_name and l.surname = r.surname\n",
      "\n",
      "Parameter estimates will be made for the following comparison(s):\n",
      "    - dob\n",
      "    - postcode_fake\n",
      "    - birth_place\n",
      "    - occupation\n",
      "\n",
      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
      "    - first_name\n",
      "    - surname\n",
      "\n",
      "Iteration 1: Largest change in params was -0.527 in probability_two_random_records_match\n",
      "Iteration 2: Largest change in params was -0.0345 in probability_two_random_records_match\n",
      "Iteration 3: Largest change in params was -0.0147 in the m_probability of birth_place, level `All other comparisons`\n",
      "Iteration 4: Largest change in params was -0.00748 in the m_probability of dob, level `All other comparisons`\n",
      "\n",
      "EM converged after 4 iterations\n",
      "\n",
      "Your model is not yet fully trained. Missing estimates for:\n",
      "    - first_name (no m values are trained).\n",
      "    - surname (no m values are trained).\n"
     ]
    },
    {
     "data": {
      "application/vnd.vegalite.v4+json": {
       "$schema": "https://vega.github.io/schema/vega-lite/v5.2.json",
       "config": {
        "header": {
         "title": null
        },
        "mark": {
         "tooltip": null
        },
        "title": {
         "anchor": "middle"
        },
        "view": {
         "height": 60,
         "width": 400
        }
       },
       "data": {
        "values": [
         {
          "bayes_factor": 15.7452908230003,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.940 or one in  1.1 records.This is equivalent to a starting match weight of 3.977.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "",
          "log2_bayes_factor": 3.9768484998694476,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 413.00081741898146,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 413.00 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.690000826831415,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.7207671310421591,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is  1.39 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": -0.47239487308621353,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.20942716239366294,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  4.77 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -2.255479525067184,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.018622096939039857,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  53.70 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.746840653225018,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 6146.433155650319,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 6,146.43 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.585533725769425,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 40.74109828141787,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 40.74 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 5.348412967282423,
          "m_probability": 0.025000000000000022,
          "m_probability_description": "Amongst matching record comparisons, 2.50% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.025019219568439496,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  39.97 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.320819401961741,
          "m_probability": 0.025000000000000022,
          "m_probability_description": "Amongst matching record comparisons, 2.50% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 176.24053930297634,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 176.24 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.461402004636435,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.05027097868595126,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  19.89 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.314130413778557,
          "m_probability": 0.050000000000000044,
          "m_probability_description": "Amongst matching record comparisons, 5.00% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 27.29236082754866,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 27.29 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.770425290044769,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.051803179309210814,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  19.30 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.270815546826654,
          "m_probability": 0.050000000000000044,
          "m_probability_description": "Amongst matching record comparisons, 5.00% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.7051524208308775,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.414 or one in  2.4 records.This is equivalent to a starting match weight of -0.504.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "",
          "log2_bayes_factor": -0.5039929607667886,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 257.9240183540208,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 257.92 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.010802315545499,
          "m_probability": 0.5932865193042552,
          "m_probability_description": "Amongst matching record comparisons, 59.33% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 13.706245226160528,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 13.71 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.7767614993004504,
          "m_probability": 0.31693651194347716,
          "m_probability_description": "Amongst matching record comparisons, 31.69% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.5132844092216932,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  1.95 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -0.9621696548729666,
          "m_probability": 0.040848283746567876,
          "m_probability_description": "Amongst matching record comparisons, 4.08% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.05466928291676144,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  18.29 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.193125737581238,
          "m_probability": 0.04892868500588616,
          "m_probability_description": "Amongst matching record comparisons, 4.89% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4357.2564993278165,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,357.26 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.08920432777886,
          "m_probability": 0.6734627335133624,
          "m_probability_description": "Amongst matching record comparisons, 67.35% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 225.60994603724862,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 225.61 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.817686860269941,
          "m_probability": 0.13844125192628284,
          "m_probability_description": "Amongst matching record comparisons, 13.84% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.1882406195293925,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.31 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.4093501210691257,
          "m_probability": 0.18809601456039107,
          "m_probability_description": "Amongst matching record comparisons, 18.81% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 144.06962885102888,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 144.07 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.170622424495475,
          "m_probability": 0.7765872026366752,
          "m_probability_description": "Amongst matching record comparisons, 77.66% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.22462359948858213,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  4.45 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.1544185862202636,
          "m_probability": 0.22341279736349723,
          "m_probability_description": "Amongst matching record comparisons, 22.34% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 25.810054400191795,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.81 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6898612771098,
          "m_probability": 0.8984034703011985,
          "m_probability_description": "Amongst matching record comparisons, 89.84% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.10526046490393261,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  9.50 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.2479644229782685,
          "m_probability": 0.10159652969911151,
          "m_probability_description": "Amongst matching record comparisons, 10.16% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.6103957500270338,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.379 or one in  2.6 records.This is equivalent to a starting match weight of -0.712.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "",
          "log2_bayes_factor": -0.7121831776629327,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 259.14246234679655,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 259.14 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.017601620568344,
          "m_probability": 0.5960892299632095,
          "m_probability_description": "Amongst matching record comparisons, 59.61% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 14.309529621181515,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.31 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8389043438672306,
          "m_probability": 0.3308865652741254,
          "m_probability_description": "Amongst matching record comparisons, 33.09% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.48568710774004464,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  2.06 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -1.0419009034533753,
          "m_probability": 0.03865203078951564,
          "m_probability_description": "Amongst matching record comparisons, 3.87% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.03840491734448192,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  26.04 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.702565145122343,
          "m_probability": 0.03437217397357013,
          "m_probability_description": "Amongst matching record comparisons, 3.44% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4316.411377384189,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,316.41 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.075616652692302,
          "m_probability": 0.6671496630115258,
          "m_probability_description": "Amongst matching record comparisons, 66.71% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 224.33667729624497,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 224.34 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.809521698864289,
          "m_probability": 0.13765993478295954,
          "m_probability_description": "Amongst matching record comparisons, 13.77% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.19534046121766735,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.12 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.355937286716486,
          "m_probability": 0.19519040220591036,
          "m_probability_description": "Amongst matching record comparisons, 19.52% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 149.8252267246725,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 149.83 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.227136746858861,
          "m_probability": 0.8076119487114797,
          "m_probability_description": "Amongst matching record comparisons, 80.76% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.19343071251560998,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.17 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.370111213846395,
          "m_probability": 0.19238805128898986,
          "m_probability_description": "Amongst matching record comparisons, 19.24% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 26.080195723205453,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 26.08 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.704882791444019,
          "m_probability": 0.9078066237507869,
          "m_probability_description": "Amongst matching record comparisons, 90.78% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.09551820001953051,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  10.47 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.3880805395975155,
          "m_probability": 0.09219337624954138,
          "m_probability_description": "Amongst matching record comparisons, 9.22% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.5901235425012258,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.371 or one in  2.7 records.This is equivalent to a starting match weight of -0.761.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "",
          "log2_bayes_factor": -0.7609110802503842,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 263.0115618703464,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 263.01 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.038982410913514,
          "m_probability": 0.6049890780805642,
          "m_probability_description": "Amongst matching record comparisons, 60.50% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 14.469817973916678,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.47 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8549748681889637,
          "m_probability": 0.3345929947581161,
          "m_probability_description": "Amongst matching record comparisons, 33.46% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.47761167275868416,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  2.09 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -1.0660899986002241,
          "m_probability": 0.03800937020296916,
          "m_probability_description": "Amongst matching record comparisons, 3.80% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.025037659196964417,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  39.94 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.319756500618782,
          "m_probability": 0.022408556958726837,
          "m_probability_description": "Amongst matching record comparisons, 2.24% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4363.6065049448525,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,363.61 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.09130529490284,
          "m_probability": 0.6744441979144904,
          "m_probability_description": "Amongst matching record comparisons, 67.44% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 226.47303675947256,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 226.47 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.82319548707074,
          "m_probability": 0.13897087113061932,
          "m_probability_description": "Amongst matching record comparisons, 13.90% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.18672837422942887,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.36 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.420986926563895,
          "m_probability": 0.18658493095541798,
          "m_probability_description": "Amongst matching record comparisons, 18.66% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 152.55265993455592,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 152.55 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.253163524990785,
          "m_probability": 0.8223137963092958,
          "m_probability_description": "Amongst matching record comparisons, 82.23% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.17864918717081726,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.60 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.484798745012871,
          "m_probability": 0.17768620369106,
          "m_probability_description": "Amongst matching record comparisons, 17.77% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 26.080210061831067,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 26.08 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.704883584622821,
          "m_probability": 0.9078071228536098,
          "m_probability_description": "Amongst matching record comparisons, 90.78% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.09551768291739937,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  10.47 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.388088349865082,
          "m_probability": 0.0921928771468434,
          "m_probability_description": "Amongst matching record comparisons, 9.22% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.5805679153059622,
          "bayes_factor_description": "The probability that two records drawn at random match is 0.367 or one in 2.7 records. This is equivalent to a starting match weight of -0.784.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "",
          "log2_bayes_factor": -0.7844632502875872,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 265.32189199903297,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 265.32 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.051599908595803,
          "m_probability": 0.6103033862603122,
          "m_probability_description": "Amongst matching record comparisons, 61.03% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 14.570450186831058,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.57 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8649735482866947,
          "m_probability": 0.3369199648381029,
          "m_probability_description": "Amongst matching record comparisons, 33.69% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.4755960375095446,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  2.10 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -1.0721913997557366,
          "m_probability": 0.037848961589134034,
          "m_probability_description": "Amongst matching record comparisons, 3.78% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.016679090412197704,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  59.96 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.905815575667971,
          "m_probability": 0.014927687312190956,
          "m_probability_description": "Amongst matching record comparisons, 1.49% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4392.5096861589245,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,392.51 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.100829752263532,
          "m_probability": 0.6789115078905416,
          "m_probability_description": "Amongst matching record comparisons, 67.89% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 227.80086824722986,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 227.80 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.8316294355634435,
          "m_probability": 0.1397856696656179,
          "m_probability_description": "Amongst matching record comparisons, 13.98% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.1814422049238122,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.51 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.462418017454674,
          "m_probability": 0.18130282244364318,
          "m_probability_description": "Amongst matching record comparisons, 18.13% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 153.8672720379109,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 153.87 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.26554258914858,
          "m_probability": 0.8294000291540573,
          "m_probability_description": "Amongst matching record comparisons, 82.94% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.17152454996427302,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.83 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.5435130136675363,
          "m_probability": 0.17059997084581074,
          "m_probability_description": "Amongst matching record comparisons, 17.06% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 26.062190740951976,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 26.06 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.703886454244968,
          "m_probability": 0.9071799013778532,
          "m_probability_description": "Amongst matching record comparisons, 90.72% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.09616752424856101,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  10.40 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.3783064113274013,
          "m_probability": 0.09282009862226943,
          "m_probability_description": "Amongst matching record comparisons, 9.28% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.36731601956729415,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         }
        ]
       },
       "params": [
        {
         "bind": {
          "input": "range",
          "max": 4,
          "min": 0,
          "step": 1
         },
         "description": "Filter by the iteration number",
         "name": "iteration_number",
         "value": 4
        }
       ],
       "resolve": {
        "axis": {
         "y": "independent"
        },
        "scale": {
         "y": "independent"
        }
       },
       "selection": {
        "zoom_selector": {
         "bind": "scales",
         "encodings": [
          "x"
         ],
         "type": "interval"
        }
       },
       "title": {
        "subtitle": "Training session blocked on l.first_name = r.first_name and l.surname = r.surname",
        "text": "Model parameters (components of final match weight)"
       },
       "transform": [
        {
         "filter": "(datum.iteration == iteration_number)"
        }
       ],
       "vconcat": [
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "probability_two_random_records_match",
            "format": ".4f",
            "title": "Probability two random records match",
            "type": "nominal"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Equivalent match weight",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "domain": false,
            "labels": false,
            "ticks": false,
            "title": ""
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": "Prior (starting) match weight",
            "titleAlign": "right",
            "titleAngle": 0,
            "titleFontWeight": "normal"
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "height": 30,
         "mark": {
          "clip": true,
          "height": 20,
          "type": "bar"
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name == 'probability_two_random_records_match')"
          }
         ]
        },
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "row": {
           "field": "comparison_name",
           "header": {
            "labelAlign": "left",
            "labelAnchor": "middle",
            "labelAngle": 0
           },
           "sort": {
            "field": "comparison_sort_order"
           },
           "type": "nominal"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "label_for_charts",
            "title": "Label",
            "type": "ordinal"
           },
           {
            "field": "sql_condition",
            "title": "SQL condition",
            "type": "nominal"
           },
           {
            "field": "m_probability",
            "format": ".4f",
            "title": "M probability",
            "type": "quantitative"
           },
           {
            "field": "u_probability",
            "format": ".4f",
            "title": "U probability",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor",
            "format": ",.4f",
            "title": "Bayes factor = m/u",
            "type": "quantitative"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Match weight = log2(m/u)",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "title": "Comparison level match weight = log2(m/u)"
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": null
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "mark": {
          "clip": true,
          "type": "bar"
         },
         "resolve": {
          "axis": {
           "y": "independent"
          },
          "scale": {
           "y": "independent"
          }
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name != 'probability_two_random_records_match')"
          }
         ]
        }
       ]
      },
      "image/png": "",
      "text/plain": [
       "<VegaLite 4 object>\n",
       "\n",
       "If you see this message, it means the renderer has not been properly enabled\n",
       "for the frontend that you are using. For more information, see\n",
       "https://altair-viz.github.io/user_guide/troubleshooting.html\n"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# EM training pass blocked on first name + surname; the chart below shows the\n",
    "# per-iteration match weights estimated in this session.\n",
    "blocking_rule = \"l.first_name = r.first_name and l.surname = r.surname\"\n",
    "training_session_names = linker.estimate_parameters_using_expectation_maximisation(\n",
    "    blocking_rule\n",
    ")\n",
    "training_session_names.match_weights_interactive_history_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "----- Starting EM training session -----\n",
      "\n",
      "Estimating the m probabilities of the model by blocking on:\n",
      "l.dob = r.dob\n",
      "\n",
      "Parameter estimates will be made for the following comparison(s):\n",
      "    - first_name\n",
      "    - surname\n",
      "    - postcode_fake\n",
      "    - birth_place\n",
      "    - occupation\n",
      "\n",
      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
      "    - dob\n",
      "\n",
      "Iteration 1: Largest change in params was -0.312 in the m_probability of first_name, level `Exact match`\n",
      "Iteration 2: Largest change in params was -0.0708 in the m_probability of first_name, level `Exact match`\n",
      "Iteration 3: Largest change in params was -0.0115 in the m_probability of surname, level `Exact match`\n",
      "Iteration 4: Largest change in params was -0.00293 in the m_probability of surname, level `Exact match`\n",
      "\n",
      "EM converged after 4 iterations\n",
      "\n",
      "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
     ]
    },
    {
     "data": {
      "application/vnd.vegalite.v4+json": {
       "$schema": "https://vega.github.io/schema/vega-lite/v5.2.json",
       "config": {
        "header": {
         "title": null
        },
        "mark": {
         "tooltip": null
        },
        "title": {
         "anchor": "middle"
        },
        "view": {
         "height": 60,
         "width": 400
        }
       },
       "data": {
        "values": [
         {
          "bayes_factor": 0.0017612707196973728,
          "bayes_factor_description": "The probability that two records drawn at random match is 0.002 or one in 568.8 records. This is equivalent to a starting match weight of -9.149.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "",
          "log2_bayes_factor": -9.149167606073906,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 72.76983736674188,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 72.77 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 6.185268681319293,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 39.552571520411476,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 39.55 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 5.305699589816577,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 0.46763749064140164,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is  2.14 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": -1.096537499076254,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.017527556839932994,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  57.05 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.834231276424401,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 1201.8453768690874,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 1,201.85 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 10.231035583062827,
          "m_probability": 0.95,
          "m_probability_description": "Amongst matching record comparisons, 95.00% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 239.91516203703725,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 239.92 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 7.906380524998534,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 0.43618460230584577,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is  2.29 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": -1.196989252393987,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.01734430956349447,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  57.66 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.8493937783316525,
          "m_probability": 0.01666666666666668,
          "m_probability_description": "Amongst matching record comparisons, 1.67% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4392.5096861589245,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,392.51 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.100829752263532,
          "m_probability": 0.6789115078905416,
          "m_probability_description": "Amongst matching record comparisons, 67.89% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 227.80086824722986,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 227.80 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.8316294355634435,
          "m_probability": 0.1397856696656179,
          "m_probability_description": "Amongst matching record comparisons, 13.98% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.1814422049238122,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.51 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.462418017454674,
          "m_probability": 0.18130282244364318,
          "m_probability_description": "Amongst matching record comparisons, 18.13% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 153.8672720379109,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 153.87 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.26554258914858,
          "m_probability": 0.8294000291540573,
          "m_probability_description": "Amongst matching record comparisons, 82.94% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.17152454996427302,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.83 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.5435130136675363,
          "m_probability": 0.17059997084581074,
          "m_probability_description": "Amongst matching record comparisons, 17.06% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 26.062190740951976,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 26.06 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.703886454244968,
          "m_probability": 0.9071799013778532,
          "m_probability_description": "Amongst matching record comparisons, 90.72% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.09616752424856101,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  10.40 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.3783064113274013,
          "m_probability": 0.09282009862226943,
          "m_probability_description": "Amongst matching record comparisons, 9.28% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.049087564684904876,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.047 or one in  21.4 records.This is equivalent to a starting match weight of -4.348.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "",
          "log2_bayes_factor": -4.348498595777272,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 48.897192303824504,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 48.90 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.611679722321693,
          "m_probability": 0.6383459736830944,
          "m_probability_description": "Amongst matching record comparisons, 63.83% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 19.59741231689835,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 19.60 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 4.292591265348278,
          "m_probability": 0.008257959623343721,
          "m_probability_description": "Amongst matching record comparisons, 0.83% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 5.837932905302266,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 5.84 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 2.5454576302057292,
          "m_probability": 0.2080647589259466,
          "m_probability_description": "Amongst matching record comparisons, 20.81% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.1528381654500091,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.54 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.7099232489600813,
          "m_probability": 0.14533130776655875,
          "m_probability_description": "Amongst matching record comparisons, 14.53% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 1080.5633578575753,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 1,080.56 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 10.07756795046165,
          "m_probability": 0.8541324946798985,
          "m_probability_description": "Amongst matching record comparisons, 85.41% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 393.8236431301664,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 393.82 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 8.621405915546271,
          "m_probability": 0.027358535116215346,
          "m_probability_description": "Amongst matching record comparisons, 2.74% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 2.414022904430979,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 2.41 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 1.271439364556988,
          "m_probability": 0.09224010857136684,
          "m_probability_description": "Amongst matching record comparisons, 9.22% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.027336916081031653,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  36.58 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.193005690267132,
          "m_probability": 0.02626886163149171,
          "m_probability_description": "Amongst matching record comparisons, 2.63% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4796.257521719333,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,796.26 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.22769340699496,
          "m_probability": 0.7413152523174679,
          "m_probability_description": "Amongst matching record comparisons, 74.13% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 228.05956548874704,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 228.06 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.8332668721114445,
          "m_probability": 0.13994441430704255,
          "m_probability_description": "Amongst matching record comparisons, 13.99% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.11883161889312746,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  8.42 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.073009333235782,
          "m_probability": 0.11874033337457472,
          "m_probability_description": "Amongst matching record comparisons, 11.87% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 163.89094439539855,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 163.89 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.356592331968195,
          "m_probability": 0.8834312343312218,
          "m_probability_description": "Amongst matching record comparisons, 88.34% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.117200518686695,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  8.53 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.092949140271683,
          "m_probability": 0.11656876566782257,
          "m_probability_description": "Amongst matching record comparisons, 11.66% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 26.254519756555755,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 26.25 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.714493901249683,
          "m_probability": 0.9138745426358261,
          "m_probability_description": "Amongst matching record comparisons, 91.39% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.08923145021803135,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  11.21 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 1,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.4863039025368985,
          "m_probability": 0.08612545736373918,
          "m_probability_description": "Amongst matching record comparisons, 8.61% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.058257456912884954,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.055 or one in  18.2 records.This is equivalent to a starting match weight of -4.101.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "",
          "log2_bayes_factor": -4.101413464253324,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 43.471569394723296,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 43.47 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.442000275093057,
          "m_probability": 0.5675152291031725,
          "m_probability_description": "Amongst matching record comparisons, 56.75% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 17.89622875042037,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 17.90 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 4.161583697109365,
          "m_probability": 0.00754111471664359,
          "m_probability_description": "Amongst matching record comparisons, 0.75% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 6.377889504151217,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 6.38 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 2.6730791033609544,
          "m_probability": 0.22730888889324097,
          "m_probability_description": "Amongst matching record comparisons, 22.73% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.20784327703062858,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  4.81 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.2664320120751236,
          "m_probability": 0.19763476728741783,
          "m_probability_description": "Amongst matching record comparisons, 19.76% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 1015.2815584242044,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 1,015.28 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.98766415651875,
          "m_probability": 0.8025304245173757,
          "m_probability_description": "Amongst matching record comparisons, 80.25% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 396.4192119029276,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 396.42 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 8.630883072191754,
          "m_probability": 0.027538846686266688,
          "m_probability_description": "Amongst matching record comparisons, 2.75% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 3.218459455452871,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 3.22 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 1.6863702946094656,
          "m_probability": 0.12297772695470426,
          "m_probability_description": "Amongst matching record comparisons, 12.30% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.04886204393315619,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  20.47 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.35514197597404,
          "m_probability": 0.04695300184217085,
          "m_probability_description": "Amongst matching record comparisons, 4.70% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4554.491009054013,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,554.49 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.153074118108885,
          "m_probability": 0.7039475333143067,
          "m_probability_description": "Amongst matching record comparisons, 70.39% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 236.34438544459627,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 236.34 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.884746782470233,
          "m_probability": 0.14502823648251637,
          "m_probability_description": "Amongst matching record comparisons, 14.50% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.1511403350247388,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.62 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.7260393690325353,
          "m_probability": 0.15102423020360212,
          "m_probability_description": "Amongst matching record comparisons, 15.10% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 159.14174316212763,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 159.14 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.3141684965564915,
          "m_probability": 0.8578313287167071,
          "m_probability_description": "Amongst matching record comparisons, 85.78% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.1429391648788649,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  7.00 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.8065268304131155,
          "m_probability": 0.14216867128430388,
          "m_probability_description": "Amongst matching record comparisons, 14.22% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 25.84810545408953,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.85 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.691986636466077,
          "m_probability": 0.8997279618477987,
          "m_probability_description": "Amongst matching record comparisons, 89.97% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.10388820744244502,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  9.63 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 2,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.2668961945418773,
          "m_probability": 0.10027203815265966,
          "m_probability_description": "Amongst matching record comparisons, 10.03% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.059782715566879904,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.056 or one in  17.7 records.This is equivalent to a starting match weight of -4.064.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "",
          "log2_bayes_factor": -4.064127758367784,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 42.735343717381475,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.74 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.417357820812287,
          "m_probability": 0.5579039063521012,
          "m_probability_description": "Amongst matching record comparisons, 55.79% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 17.68183226607441,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 17.68 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 4.144195875511176,
          "m_probability": 0.007450772303957341,
          "m_probability_description": "Amongst matching record comparisons, 0.75% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 6.369009438437449,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 6.37 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 2.6710690099416508,
          "m_probability": 0.22699240208841598,
          "m_probability_description": "Amongst matching record comparisons, 22.70% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.21837890071471,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  4.58 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.1950946218929737,
          "m_probability": 0.20765291925643434,
          "m_probability_description": "Amongst matching record comparisons, 20.77% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 1000.6747428406501,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 1,000.67 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.966757404546238,
          "m_probability": 0.7909844510739981,
          "m_probability_description": "Amongst matching record comparisons, 79.10% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 392.6047733964612,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 392.60 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 8.61693390336724,
          "m_probability": 0.027273861453286877,
          "m_probability_description": "Amongst matching record comparisons, 2.73% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 3.3308445827934863,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 3.33 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 1.7358880394328378,
          "m_probability": 0.1272719763293384,
          "m_probability_description": "Amongst matching record comparisons, 12.73% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.05668437191545441,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  17.64 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.1409051559806,
          "m_probability": 0.05446971114449472,
          "m_probability_description": "Amongst matching record comparisons, 5.45% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4493.944951833532,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,493.94 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.133766737127575,
          "m_probability": 0.6945894628850235,
          "m_probability_description": "Amongst matching record comparisons, 69.46% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 235.38654463613585,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 235.39 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.878888043959358,
          "m_probability": 0.144440475690058,
          "m_probability_description": "Amongst matching record comparisons, 14.44% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.16109381243074802,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.21 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.6340270133004244,
          "m_probability": 0.1609700614262565,
          "m_probability_description": "Amongst matching record comparisons, 16.10% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 157.6589769199383,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 157.66 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.300663507522989,
          "m_probability": 0.8498386844837121,
          "m_probability_description": "Amongst matching record comparisons, 84.98% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.15097512583634987,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.62 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.7276172194336814,
          "m_probability": 0.15016131551715894,
          "m_probability_description": "Amongst matching record comparisons, 15.02% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 25.776533349576276,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.78 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.687986345566368,
          "m_probability": 0.8972366603544166,
          "m_probability_description": "Amongst matching record comparisons, 89.72% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.1064693542015801,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  9.39 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 3,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.2314898654108246,
          "m_probability": 0.10276333964568997,
          "m_probability_description": "Amongst matching record comparisons, 10.28% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 0.06013846046847862,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.057 or one in  17.6 records.This is equivalent to a starting match weight of -4.056.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "",
          "log2_bayes_factor": -4.055568254069667,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "m_probability_description": "Amongst matching record comparisons, 55.59% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 17.63445713473013,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 17.63 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 4.140325258908541,
          "m_probability": 0.007430809366225286,
          "m_probability_description": "Amongst matching record comparisons, 0.74% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 6.364203448564137,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 6.36 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 2.6699799559617055,
          "m_probability": 0.2268211159258972,
          "m_probability_description": "Amongst matching record comparisons, 22.68% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.22072389682447663,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  4.53 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.1796852624432685,
          "m_probability": 0.20988273764202173,
          "m_probability_description": "Amongst matching record comparisons, 20.99% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "m_probability_description": "Amongst matching record comparisons, 78.81% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 391.4377585053876,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 391.44 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 8.61263911636049,
          "m_probability": 0.027192790094480643,
          "m_probability_description": "Amongst matching record comparisons, 2.72% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 3.348792035246015,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 3.35 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 1.743640785410052,
          "m_probability": 0.12795775066882872,
          "m_probability_description": "Amongst matching record comparisons, 12.80% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.05910839959746899,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  16.92 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.0804930306654015,
          "m_probability": 0.05679903196404137,
          "m_probability_description": "Amongst matching record comparisons, 5.68% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4478.216310309263,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,478.22 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.128708500720714,
          "m_probability": 0.6921584253922755,
          "m_probability_description": "Amongst matching record comparisons, 69.22% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 234.9307554121316,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 234.93 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.876091782546265,
          "m_probability": 0.1441607893025827,
          "m_probability_description": "Amongst matching record comparisons, 14.42% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.1638066202666821,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.10 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.6099344300152416,
          "m_probability": 0.16368078530446667,
          "m_probability_description": "Amongst matching record comparisons, 16.37% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 157.25209579829618,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 157.25 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.296935434718144,
          "m_probability": 0.8476454486533592,
          "m_probability_description": "Amongst matching record comparisons, 84.76% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.15318024806780273,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.53 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.70669781493369,
          "m_probability": 0.1523545513453577,
          "m_probability_description": "Amongst matching record comparisons, 15.24% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 25.76308302430687,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.76 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6872333435314175,
          "m_probability": 0.8967684777341342,
          "m_probability_description": "Amongst matching record comparisons, 89.68% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.10695442116596324,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  9.35 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "iteration": 4,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.2249319745875216,
          "m_probability": 0.10323152226580272,
          "m_probability_description": "Amongst matching record comparisons, 10.32% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 0.056726986814442375,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         }
        ]
       },
       "params": [
        {
         "bind": {
          "input": "range",
          "max": 4,
          "min": 0,
          "step": 1
         },
         "description": "Filter by the interation number",
         "name": "iteration_number",
         "value": 4
        }
       ],
       "resolve": {
        "axis": {
         "y": "independent"
        },
        "scale": {
         "y": "independent"
        }
       },
       "selection": {
        "zoom_selector": {
         "bind": "scales",
         "encodings": [
          "x"
         ],
         "type": "interval"
        }
       },
       "title": {
        "subtitle": "Training session blocked on l.dob = r.dob",
        "text": "Model parameters (components of final match weight)"
       },
       "transform": [
        {
         "filter": "(datum.iteration == iteration_number)"
        }
       ],
       "vconcat": [
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "probability_two_random_records_match",
            "format": ".4f",
            "title": "Probability two random records match",
            "type": "nominal"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Equivalent match weight",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "domain": false,
            "labels": false,
            "ticks": false,
            "title": ""
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": "Prior (starting) match weight",
            "titleAlign": "right",
            "titleAngle": 0,
            "titleFontWeight": "normal"
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "height": 30,
         "mark": {
          "clip": true,
          "height": 20,
          "type": "bar"
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name == 'probability_two_random_records_match')"
          }
         ]
        },
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "row": {
           "field": "comparison_name",
           "header": {
            "labelAlign": "left",
            "labelAnchor": "middle",
            "labelAngle": 0
           },
           "sort": {
            "field": "comparison_sort_order"
           },
           "type": "nominal"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "label_for_charts",
            "title": "Label",
            "type": "ordinal"
           },
           {
            "field": "sql_condition",
            "title": "SQL condition",
            "type": "nominal"
           },
           {
            "field": "m_probability",
            "format": ".4f",
            "title": "M probability",
            "type": "quantitative"
           },
           {
            "field": "u_probability",
            "format": ".4f",
            "title": "U probability",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor",
            "format": ",.4f",
            "title": "Bayes factor = m/u",
            "type": "quantitative"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Match weight = log2(m/u)",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "title": "Comparison level match weight = log2(m/u)"
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": null
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "mark": {
          "clip": true,
          "type": "bar"
         },
         "resolve": {
          "axis": {
           "y": "independent"
          },
          "scale": {
           "y": "independent"
          }
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name != 'probability_two_random_records_match')"
          }
         ]
        }
       ]
      },
      "image/png": "",
      "text/plain": [
       "<VegaLite 4 object>\n",
       "\n",
       "If you see this message, it means the renderer has not been properly enabled\n",
       "for the frontend that you are using. For more information, see\n",
       "https://altair-viz.github.io/user_guide/troubleshooting.html\n"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Blocking rule for this EM training session: only compare record pairs with equal dob.\n",
    "blocking_rule = \"l.dob = r.dob\"\n",
    "# Run expectation maximisation under that rule and keep the returned training session.\n",
    "training_session_dob = linker.estimate_parameters_using_expectation_maximisation(blocking_rule)\n",
    "# Last expression of the cell: chart of the match weights at each EM iteration\n",
    "# (the stored output has an iteration slider over 0-4).\n",
    "training_session_dob.match_weights_interactive_history_chart()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The final match weights can be viewed in the match weights chart:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.vegalite.v4+json": {
       "$schema": "https://vega.github.io/schema/vega-lite/v5.2.json",
       "config": {
        "header": {
         "title": null
        },
        "mark": {
         "tooltip": null
        },
        "title": {
         "anchor": "middle"
        },
        "view": {
         "height": 60,
         "width": 400
        }
       },
       "data": {
        "values": [
         {
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": "The probability that two random records drawn at random match is 0.000 or one in  38,765.8 records.This is equivalent to a starting match weight of -15.242.",
          "comparison_name": "probability_two_random_records_match",
          "comparison_sort_order": -1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "m_probability_description": null,
          "max_comparison_vector_value": 0,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": null,
          "tf_adjustment_column": null,
          "tf_adjustment_weight": null,
          "u_probability": null,
          "u_probability_description": null
         },
         {
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "m_probability_description": "Amongst matching record comparisons, 55.59% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "first_name_l = first_name_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.01305485946343725,
          "u_probability_description": "Amongst non-matching record comparisons, 1.31% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 17.63445713473013,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 17.63 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 4.140325258908541,
          "m_probability": 0.007430809366225286,
          "m_probability_description": "Amongst matching record comparisons, 0.74% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00042138010313857066,
          "u_probability_description": "Amongst non-matching record comparisons, 0.04% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 6.364203448564137,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 6.36 times more likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 2.6699799559617055,
          "m_probability": 0.2268211159258972,
          "m_probability_description": "Amongst matching record comparisons, 22.68% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "jaccard(first_name_l, first_name_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.035640142204610314,
          "u_probability_description": "Amongst non-matching record comparisons, 3.56% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.22072389682447663,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  4.53 times less likely to be a match",
          "comparison_name": "first_name",
          "comparison_sort_order": 0,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.1796852624432685,
          "m_probability": 0.20988273764202173,
          "m_probability_description": "Amongst matching record comparisons, 20.99% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9508836182288138,
          "u_probability_description": "Amongst non-matching record comparisons, 95.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "m_probability_description": "Amongst matching record comparisons, 78.81% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "surname_l = surname_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.000790451099853488,
          "u_probability_description": "Amongst non-matching record comparisons, 0.08% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 391.4377585053876,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.9` then comparison is 391.44 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "jaccard >= 0.9",
          "log2_bayes_factor": 8.61263911636049,
          "m_probability": 0.027192790094480643,
          "m_probability_description": "Amongst matching record comparisons, 2.72% of records are in the jaccard >= 0.9 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.9",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 6.946900114672094e-05,
          "u_probability_description": "Amongst non-matching record comparisons, 0.01% of records are in the jaccard >= 0.9 comparison level"
         },
         {
          "bayes_factor": 3.348792035246015,
          "bayes_factor_description": "If comparison level is `jaccard >= 0.5` then comparison is 3.35 times more likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "jaccard >= 0.5",
          "log2_bayes_factor": 1.743640785410052,
          "m_probability": 0.12795775066882872,
          "m_probability_description": "Amongst matching record comparisons, 12.80% of records are in the jaccard >= 0.5 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "jaccard(surname_l, surname_r) >= 0.5",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.038210121536982354,
          "u_probability_description": "Amongst non-matching record comparisons, 3.82% of records are in the jaccard >= 0.5 comparison level"
         },
         {
          "bayes_factor": 0.05910839959746899,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  16.92 times less likely to be a match",
          "comparison_name": "surname",
          "comparison_sort_order": 1,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -4.0804930306654015,
          "m_probability": 0.05679903196404137,
          "m_probability_description": "Amongst matching record comparisons, 5.68% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9609299583620174,
          "u_probability_description": "Amongst non-matching record comparisons, 96.09% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 265.32189199903297,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 265.32 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 2,
          "comparison_vector_value": 3,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.051599908595803,
          "m_probability": 0.6103033862603122,
          "m_probability_description": "Amongst matching record comparisons, 61.03% of records are in the exact match comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "dob_l = dob_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00230023757806814,
          "u_probability_description": "Amongst non-matching record comparisons, 0.23% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 14.570450186831058,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.57 times more likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 2,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8649735482866947,
          "m_probability": 0.3369199648381029,
          "m_probability_description": "Amongst matching record comparisons, 33.69% of records are in the levenshtein <= 1 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.023123510977211607,
          "u_probability_description": "Amongst non-matching record comparisons, 2.31% of records are in the levenshtein <= 1 comparison level"
         },
         {
          "bayes_factor": 0.4755960375095446,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is  2.10 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 2,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": -1.0721913997557366,
          "m_probability": 0.037848961589134034,
          "m_probability_description": "Amongst matching record comparisons, 3.78% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.07958216344133114,
          "u_probability_description": "Amongst non-matching record comparisons, 7.96% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.016679090412197704,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  59.96 times less likely to be a match",
          "comparison_name": "dob",
          "comparison_sort_order": 2,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -5.905815575667971,
          "m_probability": 0.014927687312190956,
          "m_probability_description": "Amongst matching record comparisons, 1.49% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 3,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.8949940880033891,
          "u_probability_description": "Amongst non-matching record comparisons, 89.50% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 4435.3629982340935,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,435.36 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 3,
          "comparison_vector_value": 2,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.11483646682574,
          "m_probability": 0.6855349666414086,
          "m_probability_description": "Amongst matching record comparisons, 68.55% of records are in the exact match comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.00015456118629170805,
          "u_probability_description": "Amongst non-matching record comparisons, 0.02% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 231.36581182968072,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 231.37 times more likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 3,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.854031887635192,
          "m_probability": 0.1419732294841003,
          "m_probability_description": "Amongst matching record comparisons, 14.20% of records are in the levenshtein <= 2 comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.0006136309784118558,
          "u_probability_description": "Amongst non-matching record comparisons, 0.06% of records are in the levenshtein <= 2 comparison level"
         },
         {
          "bayes_factor": 0.17262441259524716,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.79 times less likely to be a match",
          "comparison_name": "postcode_fake",
          "comparison_sort_order": 3,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.5342915895850107,
          "m_probability": 0.17249180387405494,
          "m_probability_description": "Amongst matching record comparisons, 17.25% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 2,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9992318078352964,
          "u_probability_description": "Amongst non-matching record comparisons, 99.92% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 155.55968391810353,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 155.56 times more likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 4,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 7.281324398248788,
          "m_probability": 0.8385227389037082,
          "m_probability_description": "Amongst matching record comparisons, 83.85% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "birth_place_l = birth_place_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.005390360264200329,
          "u_probability_description": "Amongst non-matching record comparisons, 0.54% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.16235239901603787,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.16 times less likely to be a match",
          "comparison_name": "birth_place",
          "comparison_sort_order": 4,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.622799391982059,
          "m_probability": 0.16147726109558422,
          "m_probability_description": "Amongst matching record comparisons, 16.15% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9946096397357996,
          "u_probability_description": "Amongst non-matching record comparisons, 99.46% of records are in the all other comparisons comparison level"
         },
         {
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 5,
          "comparison_vector_value": 1,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "m_probability_description": "Amongst matching record comparisons, 90.20% of records are in the exact match comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "occupation_l = occupation_r",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.03480827496026209,
          "u_probability_description": "Amongst non-matching record comparisons, 3.48% of records are in the exact match comparison level"
         },
         {
          "bayes_factor": 0.10156097270726212,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  9.85 times less likely to be a match",
          "comparison_name": "occupation",
          "comparison_sort_order": 5,
          "comparison_vector_value": 0,
          "has_tf_adjustments": false,
          "is_null_level": false,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -3.299581977211292,
          "m_probability": 0.09802581044403608,
          "m_probability_description": "Amongst matching record comparisons, 9.80% of records are in the all other comparisons comparison level",
          "max_comparison_vector_value": 1,
          "probability_two_random_records_match": 2.5795902981660818e-05,
          "sql_condition": "ELSE",
          "tf_adjustment_column": null,
          "tf_adjustment_weight": 1,
          "u_probability": 0.9651917250397379,
          "u_probability_description": "Amongst non-matching record comparisons, 96.52% of records are in the all other comparisons comparison level"
         }
        ]
       },
       "resolve": {
        "axis": {
         "y": "independent"
        },
        "scale": {
         "y": "independent"
        }
       },
       "selection": {
        "zoom_selector": {
         "bind": "scales",
         "encodings": [
          "x"
         ],
         "type": "interval"
        }
       },
       "title": {
        "subtitle": "Use mousewheel to zoom",
        "text": "Model parameters (components of final match weight)"
       },
       "vconcat": [
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "probability_two_random_records_match",
            "format": ".4f",
            "title": "Probability two random records match",
            "type": "nominal"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Equivalent match weight",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "domain": false,
            "labels": false,
            "ticks": false,
            "title": ""
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": "Prior (starting) match weight",
            "titleAlign": "right",
            "titleAngle": 0,
            "titleFontWeight": "normal"
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "height": 30,
         "mark": {
          "clip": true,
          "height": 20,
          "type": "bar"
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name == 'probability_two_random_records_match')"
          }
         ]
        },
        {
         "encoding": {
          "color": {
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             0,
             10
            ],
            "range": [
             "red",
             "orange",
             "green"
            ]
           },
           "title": "Match weight",
           "type": "quantitative"
          },
          "row": {
           "field": "comparison_name",
           "header": {
            "labelAlign": "left",
            "labelAnchor": "middle",
            "labelAngle": 0
           },
           "sort": {
            "field": "comparison_sort_order"
           },
           "type": "nominal"
          },
          "tooltip": [
           {
            "field": "comparison_name",
            "title": "Comparison name",
            "type": "nominal"
           },
           {
            "field": "label_for_charts",
            "title": "Label",
            "type": "ordinal"
           },
           {
            "field": "sql_condition",
            "title": "SQL condition",
            "type": "nominal"
           },
           {
            "field": "m_probability",
            "format": ".4f",
            "title": "M probability",
            "type": "quantitative"
           },
           {
            "field": "u_probability",
            "format": ".4f",
            "title": "U probability",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor",
            "format": ",.4f",
            "title": "Bayes factor = m/u",
            "type": "quantitative"
           },
           {
            "field": "log2_bayes_factor",
            "format": ",.4f",
            "title": "Match weight = log2(m/u)",
            "type": "quantitative"
           },
           {
            "field": "bayes_factor_description",
            "title": "Match weight description",
            "type": "nominal"
           }
          ],
          "x": {
           "axis": {
            "title": "Comparison level match weight = log2(m/u)"
           },
           "field": "log2_bayes_factor",
           "scale": {
            "domain": [
             -10,
             10
            ]
           },
           "type": "quantitative"
          },
          "y": {
           "axis": {
            "title": null
           },
           "field": "label_for_charts",
           "sort": {
            "field": "comparison_vector_value",
            "order": "descending"
           },
           "type": "nominal"
          }
         },
         "mark": {
          "clip": true,
          "type": "bar"
         },
         "resolve": {
          "axis": {
           "y": "independent"
          },
          "scale": {
           "y": "independent"
          }
         },
         "selection": {
          "zoom_selector": {
           "bind": "scales",
           "encodings": [
            "x"
           ],
           "type": "interval"
          }
         },
         "transform": [
          {
           "filter": "(datum.comparison_name != 'probability_two_random_records_match')"
          }
         ]
        }
       ]
      },
      "image/png": "",
      "text/plain": [
       "<VegaLite 4 object>\n",
       "\n",
       "If you see this message, it means the renderer has not been properly enabled\n",
       "for the frontend that you are using. For more information, see\n",
       "https://altair-viz.github.io/user_guide/troubleshooting.html\n"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "linker.match_weights_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>match_weight</th>\n",
       "      <th>match_probability</th>\n",
       "      <th>unique_id_l</th>\n",
       "      <th>unique_id_r</th>\n",
       "      <th>first_name_l</th>\n",
       "      <th>first_name_r</th>\n",
       "      <th>gamma_first_name</th>\n",
       "      <th>bf_first_name</th>\n",
       "      <th>surname_l</th>\n",
       "      <th>surname_r</th>\n",
       "      <th>...</th>\n",
       "      <th>bf_postcode_fake</th>\n",
       "      <th>birth_place_l</th>\n",
       "      <th>birth_place_r</th>\n",
       "      <th>gamma_birth_place</th>\n",
       "      <th>bf_birth_place</th>\n",
       "      <th>occupation_l</th>\n",
       "      <th>occupation_r</th>\n",
       "      <th>gamma_occupation</th>\n",
       "      <th>bf_occupation</th>\n",
       "      <th>match_key</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16.545601</td>\n",
       "      <td>0.999990</td>\n",
       "      <td>Q2296770-1</td>\n",
       "      <td>Q2296770-14</td>\n",
       "      <td>thomas</td>\n",
       "      <td>thomas</td>\n",
       "      <td>3</td>\n",
       "      <td>42.57919</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>...</td>\n",
       "      <td>231.365812</td>\n",
       "      <td>devon</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>politician</td>\n",
       "      <td>politician</td>\n",
       "      <td>1</td>\n",
       "      <td>25.912637</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.292304</td>\n",
       "      <td>0.830462</td>\n",
       "      <td>Q2296770-10</td>\n",
       "      <td>Q2296770-14</td>\n",
       "      <td>thomas</td>\n",
       "      <td>thomas</td>\n",
       "      <td>3</td>\n",
       "      <td>42.57919</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>chudleigh</td>\n",
       "      <td>...</td>\n",
       "      <td>0.172624</td>\n",
       "      <td>devon</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>politician</td>\n",
       "      <td>politician</td>\n",
       "      <td>1</td>\n",
       "      <td>25.912637</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>22.370232</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>Q1443188-1</td>\n",
       "      <td>Q1443188-3</td>\n",
       "      <td>frank</td>\n",
       "      <td>frank</td>\n",
       "      <td>3</td>\n",
       "      <td>42.57919</td>\n",
       "      <td>brightman</td>\n",
       "      <td>brightman</td>\n",
       "      <td>...</td>\n",
       "      <td>4435.362998</td>\n",
       "      <td>bristol</td>\n",
       "      <td>bristol, city of</td>\n",
       "      <td>0</td>\n",
       "      <td>0.162352</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>1</td>\n",
       "      <td>25.912637</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>22.370232</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>Q1443188-2</td>\n",
       "      <td>Q1443188-3</td>\n",
       "      <td>frank</td>\n",
       "      <td>frank</td>\n",
       "      <td>3</td>\n",
       "      <td>42.57919</td>\n",
       "      <td>brightman</td>\n",
       "      <td>brightman</td>\n",
       "      <td>...</td>\n",
       "      <td>4435.362998</td>\n",
       "      <td>bristol</td>\n",
       "      <td>bristol, city of</td>\n",
       "      <td>0</td>\n",
       "      <td>0.162352</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>1</td>\n",
       "      <td>25.912637</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6.157277</td>\n",
       "      <td>0.986182</td>\n",
       "      <td>Q1443188-4</td>\n",
       "      <td>Q1443188-5</td>\n",
       "      <td>francis</td>\n",
       "      <td>francis</td>\n",
       "      <td>3</td>\n",
       "      <td>42.57919</td>\n",
       "      <td>brightman</td>\n",
       "      <td>brightman</td>\n",
       "      <td>...</td>\n",
       "      <td>0.172624</td>\n",
       "      <td>NaN</td>\n",
       "      <td>bristol, city of</td>\n",
       "      <td>-1</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>liturgist</td>\n",
       "      <td>1</td>\n",
       "      <td>25.912637</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   match_weight  match_probability  unique_id_l  unique_id_r first_name_l  \\\n",
       "0     16.545601           0.999990   Q2296770-1  Q2296770-14       thomas   \n",
       "1      2.292304           0.830462  Q2296770-10  Q2296770-14       thomas   \n",
       "2     22.370232           1.000000   Q1443188-1   Q1443188-3        frank   \n",
       "3     22.370232           1.000000   Q1443188-2   Q1443188-3        frank   \n",
       "4      6.157277           0.986182   Q1443188-4   Q1443188-5      francis   \n",
       "\n",
       "  first_name_r  gamma_first_name  bf_first_name  surname_l  surname_r  ...  \\\n",
       "0       thomas                 3       42.57919  chudleigh  chudleigh  ...   \n",
       "1       thomas                 3       42.57919  chudleigh  chudleigh  ...   \n",
       "2        frank                 3       42.57919  brightman  brightman  ...   \n",
       "3        frank                 3       42.57919  brightman  brightman  ...   \n",
       "4      francis                 3       42.57919  brightman  brightman  ...   \n",
       "\n",
       "   bf_postcode_fake  birth_place_l     birth_place_r gamma_birth_place  \\\n",
       "0        231.365812          devon               NaN                -1   \n",
       "1          0.172624          devon               NaN                -1   \n",
       "2       4435.362998        bristol  bristol, city of                 0   \n",
       "3       4435.362998        bristol  bristol, city of                 0   \n",
       "4          0.172624            NaN  bristol, city of                -1   \n",
       "\n",
       "   bf_birth_place  occupation_l occupation_r gamma_occupation  bf_occupation  \\\n",
       "0        1.000000    politician   politician                1      25.912637   \n",
       "1        1.000000    politician   politician                1      25.912637   \n",
       "2        0.162352     liturgist    liturgist                1      25.912637   \n",
       "3        0.162352     liturgist    liturgist                1      25.912637   \n",
       "4        1.000000     liturgist    liturgist                1      25.912637   \n",
       "\n",
       "   match_key  \n",
       "0          0  \n",
       "1          0  \n",
       "2          0  \n",
       "3          0  \n",
       "4          0  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_predict = linker.predict()\n",
    "df_e = df_predict.as_pandas_dataframe(limit=5)\n",
    "df_e"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also view rows in this dataset as a waterfall chart as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.vegalite.v4+json": {
       "$schema": "https://vega.github.io/schema/vega-lite/v5.2.0.json",
       "config": {
        "view": {
         "continuousHeight": 300,
         "continuousWidth": 400
        }
       },
       "data": {
        "values": [
         {
          "bar_sort_order": 0,
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": null,
          "column_name": "Prior",
          "comparison_vector_value": null,
          "label_for_charts": "Starting match weight (prior)",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "record_number": 0,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 1,
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "column_name": "first_name",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "record_number": 0,
          "sql_condition": "first_name_l = first_name_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.01305485946343725,
          "value_l": "thomas",
          "value_r": "thomas"
         },
         {
          "bar_sort_order": 2,
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "column_name": "surname",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "record_number": 0,
          "sql_condition": "surname_l = surname_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.000790451099853488,
          "value_l": "chudleigh",
          "value_r": "chudleigh"
         },
         {
          "bar_sort_order": 3,
          "bayes_factor": 14.570450186831058,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.57 times more likely to be a match",
          "column_name": "dob",
          "comparison_vector_value": 2,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8649735482866947,
          "m_probability": 0.3369199648381029,
          "record_number": 0,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "term_frequency_adjustment": false,
          "u_probability": 0.023123510977211607,
          "value_l": "1630-08-01",
          "value_r": "1638-08-01"
         },
         {
          "bar_sort_order": 4,
          "bayes_factor": 231.36581182968072,
          "bayes_factor_description": "If comparison level is `levenshtein <= 2` then comparison is 231.37 times more likely to be a match",
          "column_name": "postcode_fake",
          "comparison_vector_value": 1,
          "label_for_charts": "levenshtein <= 2",
          "log2_bayes_factor": 7.854031887635192,
          "m_probability": 0.1419732294841003,
          "record_number": 0,
          "sql_condition": "levenshtein(postcode_fake_l, postcode_fake_r) <= 2",
          "term_frequency_adjustment": false,
          "u_probability": 0.0006136309784118558,
          "value_l": "tq13 8df",
          "value_r": "tq1w 8df"
         },
         {
          "bar_sort_order": 5,
          "bayes_factor": 1,
          "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match",
          "column_name": "birth_place",
          "comparison_vector_value": -1,
          "label_for_charts": "Null",
          "log2_bayes_factor": 0,
          "m_probability": null,
          "record_number": 0,
          "sql_condition": "birth_place_l IS NULL OR birth_place_r IS NULL",
          "term_frequency_adjustment": false,
          "u_probability": null,
          "value_l": "devon",
          "value_r": "nan"
         },
         {
          "bar_sort_order": 6,
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "column_name": "occupation",
          "comparison_vector_value": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "record_number": 0,
          "sql_condition": "occupation_l = occupation_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.03480827496026209,
          "value_l": "politician",
          "value_r": "politician"
         },
         {
          "bar_sort_order": 7,
          "bayes_factor": 95658.17601033658,
          "bayes_factor_description": null,
          "column_name": "Final score",
          "comparison_vector_value": null,
          "label_for_charts": "Final score",
          "log2_bayes_factor": 16.545600662135115,
          "m_probability": null,
          "record_number": 0,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 0,
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": null,
          "column_name": "Prior",
          "comparison_vector_value": null,
          "label_for_charts": "Starting match weight (prior)",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "record_number": 1,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 1,
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "column_name": "first_name",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "record_number": 1,
          "sql_condition": "first_name_l = first_name_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.01305485946343725,
          "value_l": "thomas",
          "value_r": "thomas"
         },
         {
          "bar_sort_order": 2,
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "column_name": "surname",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "record_number": 1,
          "sql_condition": "surname_l = surname_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.000790451099853488,
          "value_l": "chudleigh",
          "value_r": "chudleigh"
         },
         {
          "bar_sort_order": 3,
          "bayes_factor": 1,
          "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match",
          "column_name": "dob",
          "comparison_vector_value": -1,
          "label_for_charts": "Null",
          "log2_bayes_factor": 0,
          "m_probability": null,
          "record_number": 1,
          "sql_condition": "dob_l IS NULL OR dob_r IS NULL",
          "term_frequency_adjustment": false,
          "u_probability": null,
          "value_l": "nan",
          "value_r": "1638-08-01"
         },
         {
          "bar_sort_order": 4,
          "bayes_factor": 0.17262441259524716,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.79 times less likely to be a match",
          "column_name": "postcode_fake",
          "comparison_vector_value": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.5342915895850107,
          "m_probability": 0.17249180387405494,
          "record_number": 1,
          "sql_condition": "ELSE",
          "term_frequency_adjustment": false,
          "u_probability": 0.9992318078352964,
          "value_l": "tq13 8jr",
          "value_r": "tq1w 8df"
         },
         {
          "bar_sort_order": 5,
          "bayes_factor": 1,
          "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match",
          "column_name": "birth_place",
          "comparison_vector_value": -1,
          "label_for_charts": "Null",
          "log2_bayes_factor": 0,
          "m_probability": null,
          "record_number": 1,
          "sql_condition": "birth_place_l IS NULL OR birth_place_r IS NULL",
          "term_frequency_adjustment": false,
          "u_probability": null,
          "value_l": "devon",
          "value_r": "nan"
         },
         {
          "bar_sort_order": 6,
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "column_name": "occupation",
          "comparison_vector_value": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "record_number": 1,
          "sql_condition": "occupation_l = occupation_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.03480827496026209,
          "value_l": "politician",
          "value_r": "politician"
         },
         {
          "bar_sort_order": 7,
          "bayes_factor": 4.898376397285708,
          "bayes_factor_description": null,
          "column_name": "Final score",
          "comparison_vector_value": null,
          "label_for_charts": "Final score",
          "log2_bayes_factor": 2.2923036366282177,
          "m_probability": null,
          "record_number": 1,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 0,
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": null,
          "column_name": "Prior",
          "comparison_vector_value": null,
          "label_for_charts": "Starting match weight (prior)",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "record_number": 2,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 1,
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "column_name": "first_name",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "record_number": 2,
          "sql_condition": "first_name_l = first_name_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.01305485946343725,
          "value_l": "frank",
          "value_r": "frank"
         },
         {
          "bar_sort_order": 2,
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "column_name": "surname",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "record_number": 2,
          "sql_condition": "surname_l = surname_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.000790451099853488,
          "value_l": "brightman",
          "value_r": "brightman"
         },
         {
          "bar_sort_order": 3,
          "bayes_factor": 265.32189199903297,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 265.32 times more likely to be a match",
          "column_name": "dob",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.051599908595803,
          "m_probability": 0.6103033862603122,
          "record_number": 2,
          "sql_condition": "dob_l = dob_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.00230023757806814,
          "value_l": "1856-06-18",
          "value_r": "1856-06-18"
         },
         {
          "bar_sort_order": 4,
          "bayes_factor": 4435.3629982340935,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,435.36 times more likely to be a match",
          "column_name": "postcode_fake",
          "comparison_vector_value": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.11483646682574,
          "m_probability": 0.6855349666414086,
          "record_number": 2,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.00015456118629170805,
          "value_l": "bs2 0el",
          "value_r": "bs2 0el"
         },
         {
          "bar_sort_order": 5,
          "bayes_factor": 0.16235239901603787,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.16 times less likely to be a match",
          "column_name": "birth_place",
          "comparison_vector_value": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.622799391982059,
          "m_probability": 0.16147726109558422,
          "record_number": 2,
          "sql_condition": "ELSE",
          "term_frequency_adjustment": false,
          "u_probability": 0.9946096397357996,
          "value_l": "bristol",
          "value_r": "bristol, city of"
         },
         {
          "bar_sort_order": 6,
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "column_name": "occupation",
          "comparison_vector_value": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "record_number": 2,
          "sql_condition": "occupation_l = occupation_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.03480827496026209,
          "value_l": "liturgist",
          "value_r": "liturgist"
         },
         {
          "bar_sort_order": 7,
          "bayes_factor": 5421393.17959883,
          "bayes_factor_description": null,
          "column_name": "Final score",
          "comparison_vector_value": null,
          "label_for_charts": "Final score",
          "log2_bayes_factor": 22.370232209652713,
          "m_probability": null,
          "record_number": 2,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 0,
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": null,
          "column_name": "Prior",
          "comparison_vector_value": null,
          "label_for_charts": "Starting match weight (prior)",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "record_number": 3,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 1,
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "column_name": "first_name",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "record_number": 3,
          "sql_condition": "first_name_l = first_name_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.01305485946343725,
          "value_l": "frank",
          "value_r": "frank"
         },
         {
          "bar_sort_order": 2,
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "column_name": "surname",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "record_number": 3,
          "sql_condition": "surname_l = surname_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.000790451099853488,
          "value_l": "brightman",
          "value_r": "brightman"
         },
         {
          "bar_sort_order": 3,
          "bayes_factor": 265.32189199903297,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 265.32 times more likely to be a match",
          "column_name": "dob",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 8.051599908595803,
          "m_probability": 0.6103033862603122,
          "record_number": 3,
          "sql_condition": "dob_l = dob_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.00230023757806814,
          "value_l": "1856-06-18",
          "value_r": "1856-06-18"
         },
         {
          "bar_sort_order": 4,
          "bayes_factor": 4435.3629982340935,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 4,435.36 times more likely to be a match",
          "column_name": "postcode_fake",
          "comparison_vector_value": 2,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 12.11483646682574,
          "m_probability": 0.6855349666414086,
          "record_number": 3,
          "sql_condition": "postcode_fake_l = postcode_fake_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.00015456118629170805,
          "value_l": "bs2 0el",
          "value_r": "bs2 0el"
         },
         {
          "bar_sort_order": 5,
          "bayes_factor": 0.16235239901603787,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  6.16 times less likely to be a match",
          "column_name": "birth_place",
          "comparison_vector_value": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.622799391982059,
          "m_probability": 0.16147726109558422,
          "record_number": 3,
          "sql_condition": "ELSE",
          "term_frequency_adjustment": false,
          "u_probability": 0.9946096397357996,
          "value_l": "bristol",
          "value_r": "bristol, city of"
         },
         {
          "bar_sort_order": 6,
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "column_name": "occupation",
          "comparison_vector_value": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "record_number": 3,
          "sql_condition": "occupation_l = occupation_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.03480827496026209,
          "value_l": "liturgist",
          "value_r": "liturgist"
         },
         {
          "bar_sort_order": 7,
          "bayes_factor": 5421393.17959883,
          "bayes_factor_description": null,
          "column_name": "Final score",
          "comparison_vector_value": null,
          "label_for_charts": "Final score",
          "log2_bayes_factor": 22.370232209652713,
          "m_probability": null,
          "record_number": 3,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 0,
          "bayes_factor": 2.5796568427437232e-05,
          "bayes_factor_description": null,
          "column_name": "Prior",
          "comparison_vector_value": null,
          "label_for_charts": "Starting match weight (prior)",
          "log2_bayes_factor": -15.242461309640488,
          "m_probability": null,
          "record_number": 4,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         },
         {
          "bar_sort_order": 1,
          "bayes_factor": 42.57918965897765,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 42.58 times more likely to be a match",
          "column_name": "first_name",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 5.412076588451669,
          "m_probability": 0.5558653370649939,
          "record_number": 4,
          "sql_condition": "first_name_l = first_name_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.01305485946343725,
          "value_l": "francis",
          "value_r": "francis"
         },
         {
          "bar_sort_order": 2,
          "bayes_factor": 996.9629081638653,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 996.96 times more likely to be a match",
          "column_name": "surname",
          "comparison_vector_value": 3,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 9.961396020172025,
          "m_probability": 0.7880504272712593,
          "record_number": 4,
          "sql_condition": "surname_l = surname_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.000790451099853488,
          "value_l": "brightman",
          "value_r": "brightman"
         },
         {
          "bar_sort_order": 3,
          "bayes_factor": 14.570450186831058,
          "bayes_factor_description": "If comparison level is `levenshtein <= 1` then comparison is 14.57 times more likely to be a match",
          "column_name": "dob",
          "comparison_vector_value": 2,
          "label_for_charts": "levenshtein <= 1",
          "log2_bayes_factor": 3.8649735482866947,
          "m_probability": 0.3369199648381029,
          "record_number": 4,
          "sql_condition": "levenshtein(dob_l, dob_r) <= 1",
          "term_frequency_adjustment": false,
          "u_probability": 0.023123510977211607,
          "value_l": "1856-06-18",
          "value_r": "1856-86-18"
         },
         {
          "bar_sort_order": 4,
          "bayes_factor": 0.17262441259524716,
          "bayes_factor_description": "If comparison level is `all other comparisons` then comparison is  5.79 times less likely to be a match",
          "column_name": "postcode_fake",
          "comparison_vector_value": 0,
          "label_for_charts": "All other comparisons",
          "log2_bayes_factor": -2.5342915895850107,
          "m_probability": 0.17249180387405494,
          "record_number": 4,
          "sql_condition": "ELSE",
          "term_frequency_adjustment": false,
          "u_probability": 0.9992318078352964,
          "value_l": "bs2 0el",
          "value_r": "cf83 4jg"
         },
         {
          "bar_sort_order": 5,
          "bayes_factor": 1,
          "bayes_factor_description": "If comparison level is `null` then comparison is 1.00 times more likely to be a match",
          "column_name": "birth_place",
          "comparison_vector_value": -1,
          "label_for_charts": "Null",
          "log2_bayes_factor": 0,
          "m_probability": null,
          "record_number": 4,
          "sql_condition": "birth_place_l IS NULL OR birth_place_r IS NULL",
          "term_frequency_adjustment": false,
          "u_probability": null,
          "value_l": "nan",
          "value_r": "bristol, city of"
         },
         {
          "bar_sort_order": 6,
          "bayes_factor": 25.91263688262942,
          "bayes_factor_description": "If comparison level is `exact match` then comparison is 25.91 times more likely to be a match",
          "column_name": "occupation",
          "comparison_vector_value": 1,
          "label_for_charts": "Exact match",
          "log2_bayes_factor": 4.6955839272300235,
          "m_probability": 0.9019741895559936,
          "record_number": 4,
          "sql_condition": "occupation_l = occupation_r",
          "term_frequency_adjustment": false,
          "u_probability": 0.03480827496026209,
          "value_l": "liturgist",
          "value_r": "liturgist"
         },
         {
          "bar_sort_order": 7,
          "bayes_factor": 71.37154929300036,
          "bayes_factor_description": null,
          "column_name": "Final score",
          "comparison_vector_value": null,
          "label_for_charts": "Final score",
          "log2_bayes_factor": 6.1572771849149115,
          "m_probability": null,
          "record_number": 4,
          "sql_condition": null,
          "term_frequency_adjustment": null,
          "u_probability": null,
          "value_l": "",
          "value_r": ""
         }
        ]
       },
       "height": 450,
       "layer": [
        {
         "layer": [
          {
           "encoding": {
            "color": {
             "value": "black"
            },
            "size": {
             "value": 0.5
            },
            "y": {
             "field": "zero",
             "type": "quantitative"
            }
           },
           "mark": "rule"
          },
          {
           "encoding": {
            "color": {
             "condition": {
              "test": "(datum.log2_bayes_factor < 0)",
              "value": "red"
             },
             "value": "green"
            },
            "opacity": {
             "condition": {
              "test": "datum.column_name == 'Prior match weight' || datum.column_name == 'Final score'",
              "value": 1
             },
             "value": 0.5
            },
            "tooltip": [
             {
              "field": "column_name",
              "title": "Comparison column",
              "type": "nominal"
             },
             {
              "field": "value_l",
              "title": "Value (L)",
              "type": "nominal"
             },
             {
              "field": "value_r",
              "title": "Value (R)",
              "type": "nominal"
             },
             {
              "field": "label_for_charts",
              "title": "Label",
              "type": "ordinal"
             },
             {
              "field": "sql_condition",
              "title": "SQL condition",
              "type": "nominal"
             },
             {
              "field": "comparison_vector_value",
              "title": "Comparison vector value",
              "type": "nominal"
             },
             {
              "field": "bayes_factor",
              "format": ",.4f",
              "title": "Bayes factor = m/u",
              "type": "quantitative"
             },
             {
              "field": "log2_bayes_factor",
              "format": ",.4f",
              "title": "Match weight = log2(m/u)",
              "type": "quantitative"
             },
             {
              "field": "prob",
              "format": ".4f",
              "title": "Adjusted match score",
              "type": "quantitative"
             },
             {
              "field": "bayes_factor_description",
              "title": "Match weight description",
              "type": "nominal"
             }
            ],
            "x": {
             "axis": {
              "grid": true,
              "labelAlign": "center",
              "labelAngle": -20,
              "labelExpr": "datum.value == 'Prior' || datum.value == 'Final score' ? '' : datum.value",
              "labelPadding": 10,
              "tickBand": "extent",
              "title": "Column"
             },
             "field": "column_name",
             "sort": {
              "field": "bar_sort_order",
              "order": "ascending"
             },
             "type": "nominal"
            },
            "y": {
             "axis": {
              "grid": false,
              "orient": "left",
              "title": "log2(Bayes factor)"
             },
             "field": "previous_sum",
             "type": "quantitative"
            },
            "y2": {
             "field": "sum"
            }
           },
           "mark": {
            "type": "bar",
            "width": 60
           }
          },
          {
           "encoding": {
            "color": {
             "value": "white"
            },
            "text": {
             "condition": {
              "field": "log2_bayes_factor",
              "format": ".2f",
              "test": "abs(datum.log2_bayes_factor) > 1",
              "type": "nominal"
             },
             "value": ""
            },
            "x": {
             "axis": {
              "labelAngle": 0,
              "title": "Column"
             },
             "field": "column_name",
             "sort": {
              "field": "bar_sort_order",
              "order": "ascending"
             },
             "type": "nominal"
            },
            "y": {
             "axis": {
              "orient": "left"
             },
             "field": "center",
             "type": "quantitative"
            }
           },
           "mark": {
            "fontWeight": "bold",
            "type": "text"
           }
          },
          {
           "encoding": {
            "color": {
             "value": "black"
            },
            "text": {
             "field": "column_name",
             "type": "nominal"
            },
            "x": {
             "axis": {
              "labelAngle": 0,
              "title": "Column"
             },
             "field": "column_name",
             "sort": {
              "field": "bar_sort_order",
              "order": "ascending"
             },
             "type": "nominal"
            },
            "y": {
             "field": "sum_top",
             "type": "quantitative"
            }
           },
           "mark": {
            "baseline": "bottom",
            "dy": -25,
            "fontWeight": "bold",
            "type": "text"
           }
          },
          {
           "encoding": {
            "color": {
             "value": "grey"
            },
            "text": {
             "field": "value_l",
             "type": "nominal"
            },
            "x": {
             "axis": {
              "labelAngle": 0,
              "title": "Column"
             },
             "field": "column_name",
             "sort": {
              "field": "bar_sort_order",
              "order": "ascending"
             },
             "type": "nominal"
            },
            "y": {
             "field": "sum_top",
             "type": "quantitative"
            }
           },
           "mark": {
            "baseline": "bottom",
            "dy": -13,
            "fontSize": 8,
            "type": "text"
           }
          },
          {
           "encoding": {
            "color": {
             "value": "grey"
            },
            "text": {
             "field": "value_r",
             "type": "nominal"
            },
            "x": {
             "axis": {
              "labelAngle": 0,
              "title": "Column"
             },
             "field": "column_name",
             "sort": {
              "field": "bar_sort_order",
              "order": "ascending"
             },
             "type": "nominal"
            },
            "y": {
             "field": "sum_top",
             "type": "quantitative"
            }
           },
           "mark": {
            "baseline": "bottom",
            "dy": -5,
            "fontSize": 8,
            "type": "text"
           }
          }
         ]
        },
        {
         "encoding": {
          "x": {
           "axis": {
            "labelAngle": 0,
            "title": "Column"
           },
           "field": "column_name",
           "sort": {
            "field": "bar_sort_order",
            "order": "ascending"
           },
           "type": "nominal"
          },
          "x2": {
           "field": "lead"
          },
          "y": {
           "axis": {
            "labelExpr": "format(1 / (1 + pow(2, -1*datum.value)), '.2r')",
            "orient": "right",
            "title": "Probability"
           },
           "field": "sum",
           "scale": {
            "zero": false
           },
           "type": "quantitative"
          }
         },
         "mark": {
          "color": "black",
          "strokeWidth": 2,
          "type": "rule",
          "x2Offset": 30,
          "xOffset": -30
         }
        }
       ],
       "params": [
        {
         "bind": {
          "input": "range",
          "max": 4,
          "min": 0,
          "step": 1
         },
         "description": "Filter by the iteration number",
         "name": "record_number",
         "value": 0
        }
       ],
       "resolve": {
        "axis": {
         "y": "independent"
        }
       },
       "title": {
        "subtitle": "How each comparison contributes to the final match score",
        "text": "Match weights waterfall chart"
       },
       "transform": [
        {
         "filter": "(datum.record_number == record_number)"
        },
        {
         "frame": [
          null,
          0
         ],
         "window": [
          {
           "as": "sum",
           "field": "log2_bayes_factor",
           "op": "sum"
          },
          {
           "as": "lead",
           "field": "column_name",
           "op": "lead"
          }
         ]
        },
        {
         "as": "sum",
         "calculate": "datum.column_name === \"Final score\" ? datum.sum - datum.log2_bayes_factor : datum.sum"
        },
        {
         "as": "lead",
         "calculate": "datum.lead === null ? datum.column_name : datum.lead"
        },
        {
         "as": "previous_sum",
         "calculate": "datum.column_name === \"Final score\" || datum.column_name === \"Prior match weight\" ? 0 : datum.sum - datum.log2_bayes_factor"
        },
        {
         "as": "top_label",
         "calculate": "datum.sum > datum.previous_sum ? datum.column_name : \"\""
        },
        {
         "as": "bottom_label",
         "calculate": "datum.sum < datum.previous_sum ? datum.column_name : \"\""
        },
        {
         "as": "sum_top",
         "calculate": "datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum"
        },
        {
         "as": "sum_bottom",
         "calculate": "datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum"
        },
        {
         "as": "center",
         "calculate": "(datum.sum + datum.previous_sum) / 2"
        },
        {
         "as": "text_log2_bayes_factor",
         "calculate": "(datum.log2_bayes_factor > 0 ? \"+\" : \"\") + datum.log2_bayes_factor"
        },
        {
         "as": "dy",
         "calculate": "datum.sum < datum.previous_sum ? 4 : -4"
        },
        {
         "as": "baseline",
         "calculate": "datum.sum < datum.previous_sum ? \"top\" : \"bottom\""
        },
        {
         "as": "prob",
         "calculate": "1. / (1 + pow(2, -1.*datum.sum))"
        },
        {
         "as": "zero",
         "calculate": "0*datum.sum"
        }
       ],
       "width": {
        "step": 75
       }
      },
      "image/png": "",
      "text/plain": [
       "<VegaLite 4 object>\n",
       "\n",
       "If you see this message, it means the renderer has not been properly enabled\n",
       "for the frontend that you are using. For more information, see\n",
       "https://altair-viz.github.io/user_guide/troubleshooting.html\n"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from splink.charts import waterfall_chart\n",
    "records_to_plot = df_e.to_dict(orient=\"records\")\n",
    "linker.waterfall_chart(records_to_plot, filter_nulls=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Completed iteration 1, root rows count 669\n",
      "Completed iteration 2, root rows count 147\n",
      "Completed iteration 3, root rows count 43\n",
      "Completed iteration 4, root rows count 11\n",
      "Completed iteration 5, root rows count 1\n",
      "Completed iteration 6, root rows count 0\n"
     ]
    }
   ],
   "source": [
    "# Group the pairwise predictions in df_predict into clusters, using a\n",
    "# match-probability threshold of 0.95.\n",
    "clusters = linker.cluster_pairwise_predictions_at_threshold(\n",
    "    df_predict,\n",
    "    threshold_match_probability=0.95,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the cluster studio dashboard from the predictions and clusters.\n",
    "# The HTML file name passed here is the one the next cell embeds.\n",
    "linker.cluster_studio_dashboard(\n",
    "    df_predict,\n",
    "    clusters,\n",
    "    \"50k_cluster.html\",\n",
    "    sampling_method=\"by_cluster_size\",\n",
    "    overwrite=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"100%\"\n",
       "            height=\"1200\"\n",
       "            src=\"./50k_cluster.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x7fb6c2063430>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from IPython.display import IFrame\n",
    "\n",
    "# Embed the dashboard HTML produced by the cluster_studio_dashboard cell.\n",
    "IFrame(src=\"./50k_cluster.html\", width=\"100%\", height=1200)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "vscode": {
   "interpreter": {
    "hash": "cc173ace240fa2ad02472fa75d75ddd885c067b637b7c50cecea7593995ac3de"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}