{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "fb29d421",
"metadata": {},
"outputs": [],
"source": [
"from splink.duckdb.duckdb_linker import DuckDBLinker"
]
},
{
"cell_type": "markdown",
"id": "7b0dedd9",
"metadata": {},
"source": [
"## Read in data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "bbfdc70c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" unique_id | \n",
" first_name | \n",
" surname | \n",
" dob | \n",
" city | \n",
" email | \n",
" cluster | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" Robert | \n",
" Alan | \n",
" 1971-06-24 | \n",
" NaN | \n",
" robert255@smith.net | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" Robert | \n",
" Allen | \n",
" 1971-05-24 | \n",
" NaN | \n",
" roberta25@smith.net | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" Rob | \n",
" Allen | \n",
" 1971-06-24 | \n",
" London | \n",
" roberta25@smith.net | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" Robert | \n",
" Alen | \n",
" 1971-06-24 | \n",
" Lonon | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" Grace | \n",
" NaN | \n",
" 1997-04-26 | \n",
" Hull | \n",
" grace.kelly52@jones.com | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" unique_id first_name surname dob city email \\\n",
"0 0 Robert Alan 1971-06-24 NaN robert255@smith.net \n",
"1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net \n",
"2 2 Rob Allen 1971-06-24 London roberta25@smith.net \n",
"3 3 Robert Alen 1971-06-24 Lonon NaN \n",
"4 4 Grace NaN 1997-04-26 Hull grace.kelly52@jones.com \n",
"\n",
" cluster \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 1 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd \n",
"pd.options.display.max_rows = 1000\n",
"df = pd.read_csv(\"./data/fake_1000.csv\")\n",
"df.head(5)"
]
},
{
"cell_type": "markdown",
"id": "e10eb84f",
"metadata": {},
"source": [
"## Initialise the linker, passing in the input dataset(s)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c6a8b5e9",
"metadata": {},
"outputs": [],
"source": [
"\n",
"linker = DuckDBLinker(df)"
]
},
{
"cell_type": "markdown",
"id": "f39f379d",
"metadata": {},
"source": [
"## Load estimated model parameters from previous notebook"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c8d9f230",
"metadata": {},
"outputs": [],
"source": [
"linker.load_settings_from_json(\"./demo_settings/saved_model_from_demo.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d8c3503",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "f5b41cce",
"metadata": {},
"source": [
"## Accuracy analysis\n",
"\n",
"Since we have labels in this dataset, we can compute the accuracy of our trained model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fc562d19",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" unique_id_l | \n",
" source_dataset_l | \n",
" unique_id_r | \n",
" source_dataset_r | \n",
" clerical_match_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" fake_1000 | \n",
" 1 | \n",
" fake_1000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" fake_1000 | \n",
" 2 | \n",
" fake_1000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0 | \n",
" fake_1000 | \n",
" 3 | \n",
" fake_1000 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" fake_1000 | \n",
" 4 | \n",
" fake_1000 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0 | \n",
" fake_1000 | \n",
" 5 | \n",
" fake_1000 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n",
"0 0 fake_1000 1 fake_1000 \n",
"1 0 fake_1000 2 fake_1000 \n",
"2 0 fake_1000 3 fake_1000 \n",
"3 0 fake_1000 4 fake_1000 \n",
"4 0 fake_1000 5 fake_1000 \n",
"\n",
" clerical_match_score \n",
"0 1.0 \n",
"1 1.0 \n",
"2 1.0 \n",
"3 0.0 \n",
"4 0.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_labels = pd.read_csv(\"./data/fake_1000_labels.csv\")\n",
"df_labels.head(5)"
]
},
{
"cell_type": "markdown",
"id": "81e4396d",
"metadata": {},
"source": [
"Then to produce the chart:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "7b308339",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_predictions = linker.predict()\n",
"linker._initialise_df_concat_with_tf()\n",
"linker._con.register(\"labels\", df_labels)"
]
},
{
"cell_type": "markdown",
"id": "72a122e1",
"metadata": {},
"source": [
"### Plot ROC Curve\n",
"using `roc_chart_from_labels`"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "01dd7eec",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linker.roc_chart_from_labels(\"labels\")"
]
},
{
"cell_type": "markdown",
"id": "9f749c3c",
"metadata": {},
"source": [
"### Plot precision-recall chart\n",
"\n",
"A precision-recall chart is also available with `linker.precision_recall_from_labels`"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "18d25327",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
""
],
"text/plain": [
""
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linker.precision_recall_chart_from_labels(\"labels\")"
]
},
{
"cell_type": "markdown",
"id": "dfc81001",
"metadata": {},
"source": [
"## Splink comparison viewer\n",
"\n",
"Create a [splink_comparison_viewer](https://www.youtube.com/watch?v=DNvCMqjipis) interactive dashboard and display in an iframe"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "9e29f677",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linker.comparison_viewer_dashboard(df_predictions, \"scv.html\", True,2)\n",
"\n",
"from IPython.display import IFrame\n",
"\n",
"IFrame(\n",
" src=\"./scv.html\", width=\"100%\", height=1200\n",
") "
]
},
{
"cell_type": "markdown",
"id": "51b954c4",
"metadata": {},
"source": [
"## Clustering and visualising clusters"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "63d30ebc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Completed iteration 1, root rows count 32\n",
"Completed iteration 2, root rows count 21\n",
"Completed iteration 3, root rows count 11\n",
"Completed iteration 4, root rows count 8\n",
"Completed iteration 5, root rows count 3\n",
"Completed iteration 6, root rows count 1\n",
"Completed iteration 7, root rows count 0\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cluster_id | \n",
" unique_id | \n",
" first_name | \n",
" surname | \n",
" dob | \n",
" city | \n",
" email | \n",
" cluster | \n",
" tf_city | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" Robert | \n",
" Alan | \n",
" 1971-06-24 | \n",
" NaN | \n",
" robert255@smith.net | \n",
" 0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1 | \n",
" Robert | \n",
" Allen | \n",
" 1971-05-24 | \n",
" NaN | \n",
" roberta25@smith.net | \n",
" 0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 2 | \n",
" Rob | \n",
" Allen | \n",
" 1971-06-24 | \n",
" London | \n",
" roberta25@smith.net | \n",
" 0 | \n",
" 0.212792 | \n",
"
\n",
" \n",
" | 3 | \n",
" 0 | \n",
" 3 | \n",
" Robert | \n",
" Alen | \n",
" 1971-06-24 | \n",
" Lonon | \n",
" NaN | \n",
" 0 | \n",
" 0.007380 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 4 | \n",
" Grace | \n",
" NaN | \n",
" 1997-04-26 | \n",
" Hull | \n",
" grace.kelly52@jones.com | \n",
" 1 | \n",
" 0.001230 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cluster_id unique_id first_name surname dob city \\\n",
"0 0 0 Robert Alan 1971-06-24 NaN \n",
"1 1 1 Robert Allen 1971-05-24 NaN \n",
"2 1 2 Rob Allen 1971-06-24 London \n",
"3 0 3 Robert Alen 1971-06-24 Lonon \n",
"4 4 4 Grace NaN 1997-04-26 Hull \n",
"\n",
" email cluster tf_city \n",
"0 robert255@smith.net 0 NaN \n",
"1 roberta25@smith.net 0 NaN \n",
"2 roberta25@smith.net 0 0.212792 \n",
"3 NaN 0 0.007380 \n",
"4 grace.kelly52@jones.com 1 0.001230 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predictions, 0.2)\n",
"df_clustered.as_pandas_dataframe(limit=5)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "c1154d41",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"linker.cluster_studio_dashboard(df_predictions,df_clustered, sampling_method=\"by_cluster_size\", out_path=\"cluster_studio.html\", overwrite=True)\n",
"\n",
"from IPython.display import IFrame\n",
"\n",
"IFrame(\n",
" src=\"./cluster_studio.html\", width=\"100%\", height=1200\n",
") "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48b76176",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}