{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "fb29d421", "metadata": {}, "outputs": [], "source": [ "from splink.duckdb.duckdb_linker import DuckDBLinker" ] }, { "cell_type": "markdown", "id": "7b0dedd9", "metadata": {}, "source": [ "## Read in data" ] }, { "cell_type": "code", "execution_count": 2, "id": "bbfdc70c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
unique_idfirst_namesurnamedobcityemailcluster
00RobertAlan1971-06-24NaNrobert255@smith.net0
11RobertAllen1971-05-24NaNroberta25@smith.net0
22RobAllen1971-06-24Londonroberta25@smith.net0
33RobertAlen1971-06-24LononNaN0
44GraceNaN1997-04-26Hullgrace.kelly52@jones.com1
\n", "
" ], "text/plain": [ " unique_id first_name surname dob city email \\\n", "0 0 Robert Alan 1971-06-24 NaN robert255@smith.net \n", "1 1 Robert Allen 1971-05-24 NaN roberta25@smith.net \n", "2 2 Rob Allen 1971-06-24 London roberta25@smith.net \n", "3 3 Robert Alen 1971-06-24 Lonon NaN \n", "4 4 Grace NaN 1997-04-26 Hull grace.kelly52@jones.com \n", "\n", " cluster \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd \n", "pd.options.display.max_rows = 1000\n", "df = pd.read_csv(\"./data/fake_1000.csv\")\n", "df.head(5)" ] }, { "cell_type": "markdown", "id": "e10eb84f", "metadata": {}, "source": [ "## Initialise the linker, passing in the input dataset(s)" ] }, { "cell_type": "code", "execution_count": 3, "id": "c6a8b5e9", "metadata": {}, "outputs": [], "source": [ "\n", "linker = DuckDBLinker(df)" ] }, { "cell_type": "markdown", "id": "f39f379d", "metadata": {}, "source": [ "## Load estimated model parameters from previous notebook" ] }, { "cell_type": "code", "execution_count": 4, "id": "c8d9f230", "metadata": {}, "outputs": [], "source": [ "linker.load_settings_from_json(\"./demo_settings/saved_model_from_demo.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "2d8c3503", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "f5b41cce", "metadata": {}, "source": [ "## Accuracy analysis\n", "\n", "Since we have labels in this dataset, we can compute the accuracy of our trained model" ] }, { "cell_type": "code", "execution_count": 12, "id": "fc562d19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
unique_id_lsource_dataset_lunique_id_rsource_dataset_rclerical_match_score
00fake_10001fake_10001.0
10fake_10002fake_10001.0
20fake_10003fake_10001.0
30fake_10004fake_10000.0
40fake_10005fake_10000.0
\n", "
" ], "text/plain": [ " unique_id_l source_dataset_l unique_id_r source_dataset_r \\\n", "0 0 fake_1000 1 fake_1000 \n", "1 0 fake_1000 2 fake_1000 \n", "2 0 fake_1000 3 fake_1000 \n", "3 0 fake_1000 4 fake_1000 \n", "4 0 fake_1000 5 fake_1000 \n", "\n", " clerical_match_score \n", "0 1.0 \n", "1 1.0 \n", "2 1.0 \n", "3 0.0 \n", "4 0.0 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_labels = pd.read_csv(\"./data/fake_1000_labels.csv\")\n", "df_labels.head(5)" ] }, { "cell_type": "markdown", "id": "81e4396d", "metadata": {}, "source": [ "Then to produce the chart:" ] }, { "cell_type": "code", "execution_count": 31, "id": "7b308339", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_predictions = linker.predict()\n", "linker._initialise_df_concat_with_tf()\n", "linker._con.register(\"labels\", df_labels)" ] }, { "cell_type": "markdown", "id": "72a122e1", "metadata": {}, "source": [ "### Plot ROC Curve\n", "using `roc_chart_from_labels`" ] }, { "cell_type": "code", "execution_count": 32, "id": "01dd7eec", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.roc_chart_from_labels(\"labels\")" ] }, { "cell_type": "markdown", "id": "9f749c3c", "metadata": {}, "source": [ "### Plot precision-recall chart\n", "\n", "A precision-recall chart is also available with `linker.precision_recall_chart_from_labels`" ] }, { "cell_type": "code", "execution_count": 33, "id": "18d25327", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.precision_recall_chart_from_labels(\"labels\")" ] }, { "cell_type": "markdown", "id": "dfc81001", "metadata": {}, "source": [ "## Splink comparison viewer\n", "\n", "Create a [splink_comparison_viewer](https://www.youtube.com/watch?v=DNvCMqjipis) interactive dashboard and display in an iframe" ] }, { "cell_type": "code", "execution_count": 34, "id": "9e29f677", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.comparison_viewer_dashboard(df_predictions, \"scv.html\", True,2)\n", "\n", "from IPython.display import IFrame\n", "\n", "IFrame(\n", " src=\"./scv.html\", width=\"100%\", height=1200\n", ") " ] }, { "cell_type": "markdown", "id": "51b954c4", "metadata": {}, "source": [ "## Clustering and visualising clusters" ] }, { "cell_type": "code", "execution_count": 36, "id": "63d30ebc", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Completed iteration 1, root rows count 32\n", "Completed iteration 2, root rows count 21\n", "Completed iteration 3, root rows count 11\n", "Completed iteration 4, root rows count 8\n", "Completed iteration 5, root rows count 3\n", "Completed iteration 6, root rows count 1\n", "Completed iteration 7, root rows count 0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cluster_idunique_idfirst_namesurnamedobcityemailclustertf_city
000RobertAlan1971-06-24NaNrobert255@smith.net0NaN
111RobertAllen1971-05-24NaNroberta25@smith.net0NaN
212RobAllen1971-06-24Londonroberta25@smith.net00.212792
303RobertAlen1971-06-24LononNaN00.007380
444GraceNaN1997-04-26Hullgrace.kelly52@jones.com10.001230
\n", "
" ], "text/plain": [ " cluster_id unique_id first_name surname dob city \\\n", "0 0 0 Robert Alan 1971-06-24 NaN \n", "1 1 1 Robert Allen 1971-05-24 NaN \n", "2 1 2 Rob Allen 1971-06-24 London \n", "3 0 3 Robert Alen 1971-06-24 Lonon \n", "4 4 4 Grace NaN 1997-04-26 Hull \n", "\n", " email cluster tf_city \n", "0 robert255@smith.net 0 NaN \n", "1 roberta25@smith.net 0 NaN \n", "2 roberta25@smith.net 0 0.212792 \n", "3 NaN 0 0.007380 \n", "4 grace.kelly52@jones.com 1 0.001230 " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_clustered = linker.cluster_pairwise_predictions_at_threshold(df_predictions, 0.2)\n", "df_clustered.as_pandas_dataframe(limit=5)" ] }, { "cell_type": "code", "execution_count": 37, "id": "c1154d41", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "linker.cluster_studio_dashboard(df_predictions,df_clustered, sampling_method=\"by_cluster_size\", out_path=\"cluster_studio.html\", overwrite=True)\n", "\n", "from IPython.display import IFrame\n", "\n", "IFrame(\n", " src=\"./cluster_studio.html\", width=\"100%\", height=1200\n", ") " ] }, { "cell_type": "code", "execution_count": null, "id": "48b76176", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 5 }