{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Historical people: Quick and dirty\n", "\n", "This example shows how to get some initial record linkage results as quickly as possible.\n", "\n", "The model is deliberately minimal: the `m` probabilities are never trained, so `predict()` falls back to default `m` values and emits a warning saying so. There are many ways to improve the accuracy of this model, but this may be a good place to start if you just want to give Splink a try and see what it's capable of." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uncorrupted_recordclusterfull_namedobbirth_placepostcode_fakelatlnggenderoccupationunique_id
0TrueQ2296770thomas clifford, 1st baron clifford of chudleigh1630-08-01DevonTQ13 8DF50.692449-3.813964malepoliticianQ2296770-1
1FalseQ2296770thomas of chudleigh1630-08-01DevonTQ13 8DF50.692449-3.813964malepoliticianQ2296770-2
2FalseQ2296770tom 1st baron clifford of chudleigh1630-08-01DevonTQ13 8DF50.692449-3.813964malepoliticianQ2296770-3
3FalseQ2296770thomas 1st chudleigh1630-08-01DevonTQ13 8HU50.687638-3.895877NonepoliticianQ2296770-4
4FalseQ2296770thomas clifford, 1st baron chudleigh1630-08-01DevonTQ13 8DF50.692449-3.813964NonepoliticianQ2296770-5
\n", "
" ], "text/plain": [ " uncorrupted_record cluster \\\n", "0 True Q2296770 \n", "1 False Q2296770 \n", "2 False Q2296770 \n", "3 False Q2296770 \n", "4 False Q2296770 \n", "\n", " full_name dob birth_place \\\n", "0 thomas clifford, 1st baron clifford of chudleigh 1630-08-01 Devon \n", "1 thomas of chudleigh 1630-08-01 Devon \n", "2 tom 1st baron clifford of chudleigh 1630-08-01 Devon \n", "3 thomas 1st chudleigh 1630-08-01 Devon \n", "4 thomas clifford, 1st baron chudleigh 1630-08-01 Devon \n", "\n", " postcode_fake lat lng gender occupation unique_id \n", "0 TQ13 8DF 50.692449 -3.813964 male politician Q2296770-1 \n", "1 TQ13 8DF 50.692449 -3.813964 male politician Q2296770-2 \n", "2 TQ13 8DF 50.692449 -3.813964 male politician Q2296770-3 \n", "3 TQ13 8HU 50.687638 -3.895877 None politician Q2296770-4 \n", "4 TQ13 8DF 50.692449 -3.813964 None politician Q2296770-5 " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd \n", "df = pd.read_parquet(\"./data/historical_figures_with_errors_50k.parquet\")\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from splink.duckdb.duckdb_linker import DuckDBLinker\n", "from splink.duckdb import duckdb_comparison_library as cl\n", "settings = {\n", " \"link_type\": \"dedupe_only\",\n", " \"blocking_rules_to_generate_predictions\": [\n", " \"l.full_name = r.full_name\",\n", " \"substr(l.full_name,1,6) = substr(r.full_name,1,6) and l.dob = r.dob and l.birth_place = r.birth_place\",\n", " \"l.dob = r.dob and l.birth_place = r.birth_place\",\n", " \"l.postcode_fake = r.postcode_fake\",\n", " ],\n", " \"comparisons\": [\n", " cl.levenshtein_at_thresholds(\"full_name\", [1,3,5], term_frequency_adjustments=True),\n", " cl.levenshtein_at_thresholds(\"dob\", [1,2], term_frequency_adjustments=True),\n", " cl.levenshtein_at_thresholds(\"postcode_fake\", 2),\n", " cl.exact_match(\"birth_place\", 
term_frequency_adjustments=True),\n", " cl.exact_match(\"occupation\", term_frequency_adjustments=True),\n", " ], \n", " \n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "linker = DuckDBLinker(df, settings, set_up_basic_logging=False)\n", "deterministic_rules = [\n", " \"l.full_name = r.full_name\",\n", " \"l.postcode_fake = r.postcode_fake and l.dob = r.dob\",\n", "]\n", "\n", "linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "linker.estimate_u_using_random_sampling(target_rows=2e6)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", " -- WARNING --\n", "You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary. To produce predictions the following untrained trained parameters will use default values.\n", "Comparison: 'full_name':\n", " m values not fully trained\n", "Comparison: 'dob':\n", " m values not fully trained\n", "Comparison: 'postcode_fake':\n", " m values not fully trained\n", "Comparison: 'birth_place':\n", " m values not fully trained\n", "Comparison: 'occupation':\n", " m values not fully trained\n" ] } ], "source": [ "results = linker.predict(threshold_match_probability=0.9)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
match_weightmatch_probabilityunique_id_lunique_id_rfull_name_lfull_name_rgamma_full_namedob_ldob_rgamma_dobpostcode_fake_lpostcode_fake_rgamma_postcode_fakebirth_place_lbirth_place_rgamma_birth_placeoccupation_loccupation_rgamma_occupationmatch_key
031.4815281.000000Q90404618-1Q90404618-3emlie cliffordemlie clifford41861-01-011861-01-013WR11 7QPWR11 7QW1WychavonWychavon1playwrightplaywright10
131.4815281.000000Q90404618-2Q90404618-3emlie cliffordemlie clifford41861-01-011861-01-013WR11 7QPWR11 7QW1WychavonWychavon1playwrightplaywright10
214.0907410.999943Q2516590-3Q2516590-9william wattswilliam watts41860-06-07NaN-1SY5 7NTSY5 7NT2ShropshireNaN-1geologistNaN-10
354.7512971.000000Q631006-1Q631006-2moses gastermoses gaster41856-09-171856-09-173EX20 3PZEX20 3PZ2BucharestBucharest1rabbirabbi10
421.4282051.000000Q7795446-2Q7795446-3thomas barrythomas barry41560-01-011560-01-013CF14 5GHCF14 6TQ0CardiffCardiff1judgejudge10
\n", "
" ], "text/plain": [ " match_weight match_probability unique_id_l unique_id_r full_name_l \\\n", "0 31.481528 1.000000 Q90404618-1 Q90404618-3 emlie clifford \n", "1 31.481528 1.000000 Q90404618-2 Q90404618-3 emlie clifford \n", "2 14.090741 0.999943 Q2516590-3 Q2516590-9 william watts \n", "3 54.751297 1.000000 Q631006-1 Q631006-2 moses gaster \n", "4 21.428205 1.000000 Q7795446-2 Q7795446-3 thomas barry \n", "\n", " full_name_r gamma_full_name dob_l dob_r gamma_dob \\\n", "0 emlie clifford 4 1861-01-01 1861-01-01 3 \n", "1 emlie clifford 4 1861-01-01 1861-01-01 3 \n", "2 william watts 4 1860-06-07 NaN -1 \n", "3 moses gaster 4 1856-09-17 1856-09-17 3 \n", "4 thomas barry 4 1560-01-01 1560-01-01 3 \n", "\n", " postcode_fake_l postcode_fake_r gamma_postcode_fake birth_place_l \\\n", "0 WR11 7QP WR11 7QW 1 Wychavon \n", "1 WR11 7QP WR11 7QW 1 Wychavon \n", "2 SY5 7NT SY5 7NT 2 Shropshire \n", "3 EX20 3PZ EX20 3PZ 2 Bucharest \n", "4 CF14 5GH CF14 6TQ 0 Cardiff \n", "\n", " birth_place_r gamma_birth_place occupation_l occupation_r \\\n", "0 Wychavon 1 playwright playwright \n", "1 Wychavon 1 playwright playwright \n", "2 NaN -1 geologist NaN \n", "3 Bucharest 1 rabbi rabbi \n", "4 Cardiff 1 judge judge \n", "\n", " gamma_occupation match_key \n", "0 1 0 \n", "1 1 0 \n", "2 -1 0 \n", "3 1 0 \n", "4 1 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results.as_pandas_dataframe(limit=5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" }, "vscode": { "interpreter": { "hash": "3b53fa520a31e303a9636a08ff10a3bbc14893ee50cb37445791fa59628fc75b" } } }, "nbformat": 4, "nbformat_minor": 4 }