{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Splink cluster studio\n",
"\n",
"This demo shows how to create an interactive visualisation of clusters.\n",
"\n",
"The first few steps use the model trained in the deduplication quickstart example"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import altair as alt\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"22/01/11 05:47:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.\n",
"22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.\n",
" \r"
]
}
],
"source": [
"from utility_functions.demo_utils import get_spark\n",
"spark = get_spark() # See utility_functions/demo_utils.py for how to set up Spark\n",
"df = spark.read.parquet(\"data/fake_1000.parquet\")"
]
},
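{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the demo's utility functions aren't available, a minimal local `SparkSession` can be built directly. This is a sketch, not the demo's exact setup: `get_spark()` in `utility_functions/demo_utils.py` may configure additional options."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Minimal local SparkSession (a sketch; the demo's get_spark() may set\n",
"# further options, e.g. driver memory or UDF registration)\n",
"spark = (\n",
"    SparkSession.builder.master(\"local[*]\")\n",
"    .appName(\"splink_cluster_studio_demo\")  # hypothetical app name\n",
"    .getOrCreate()\n",
")"
]
},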
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"22/01/11 05:48:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
]
}
],
"source": [
"from splink import Splink\n",
"\n",
"settings = {\n",
" \"link_type\": \"dedupe_only\",\n",
" \"blocking_rules\": [\"l.surname = r.surname\",\n",
" \"l.first_name = r.first_name\",\n",
" \"l.dob = r.dob\",\n",
" \"l.email = r.email\",\n",
" ],\n",
" \"comparison_columns\": [\n",
" {\n",
" \"col_name\": \"first_name\",\n",
" \"num_levels\": 3,\n",
" \"term_frequency_adjustments\": True,\n",
" \"m_probabilities\": [\n",
" 0.3941434323787689,\n",
" 0.14060422778129578,\n",
" 0.4652523398399353,\n",
" ],\n",
" \"u_probabilities\": [\n",
" 0.9941955208778381,\n",
" 0.0028420439921319485,\n",
" 0.002962463302537799,\n",
" ],\n",
" },\n",
" {\n",
" \"col_name\": \"surname\",\n",
" \"num_levels\": 3,\n",
" \"term_frequency_adjustments\": True,\n",
" \"m_probabilities\": [\n",
" 0.3971782326698303,\n",
" 0.11397389322519302,\n",
" 0.48884785175323486,\n",
" ],\n",
" \"u_probabilities\": [\n",
" 0.9930331110954285,\n",
" 0.00222682929597795,\n",
" 0.004740049596875906,\n",
" ],\n",
" },\n",
" {\n",
" \"col_name\": \"dob\",\n",
" \"m_probabilities\": [0.38818904757499695, 0.6118109226226807],\n",
" \"u_probabilities\": [0.9997655749320984, 0.00023440067889168859],\n",
" },\n",
" {\n",
" \"col_name\": \"city\",\n",
" \"case_expression\": \"case\\n when city_l is null or city_r is null then -1\\n when city_l = city_r then 1\\n else 0 end as gamma_city\",\n",
" \"m_probabilities\": [0.29216697812080383, 0.7078329920768738],\n",
" \"u_probabilities\": [0.9105007648468018, 0.08949924260377884],\n",
" },\n",
" {\n",
" \"col_name\": \"email\",\n",
" \"m_probabilities\": [0.32461094856262207, 0.6753890514373779],\n",
" \"u_probabilities\": [0.999818742275238, 0.00018127892690245062],\n",
" },\n",
" ],\n",
" \"additional_columns_to_retain\": [\"group\"],\n",
" \"proportion_of_matches\": 0.005672720726579428,\n",
" }\n",
"\n",
"\n",
"\n",
"linker = Splink(settings, df, spark)\n",
"df_e = linker.manually_apply_fellegi_sunter_weights()"
]
},
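{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before clustering, it can help to sanity-check the scored comparisons. A quick inspection sketch, assuming `df_e` follows Splink's standard pairwise output with `unique_id_l`, `unique_id_r` and `match_probability` columns:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Peek at a few scored record pairs (column names assume Splink's\n",
"# standard pairwise output)\n",
"df_e.select(\"unique_id_l\", \"unique_id_r\", \"match_probability\").show(5)"
]
},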
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cluster_low | \n",
" unique_id | \n",
" first_name | \n",
" surname | \n",
" dob | \n",
" city | \n",
" email | \n",
" group | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" Julia | \n",
" Taylor | \n",
" 2016-01-27 | \n",
" London | \n",
" hannah88@powers.com | \n",
" 0 | \n",
"
\n",
" \n",
" 387 | \n",
" 0 | \n",
" 3 | \n",
" Julia | \n",
" Taylor | \n",
" 2015-10-29 | \n",
" None | \n",
" hannah88opowersc@m | \n",
" 0 | \n",
"
\n",
" \n",
" 644 | \n",
" 0 | \n",
" 0 | \n",
" Julia | \n",
" None | \n",
" 2015-10-29 | \n",
" London | \n",
" hannah88@powers.com | \n",
" 0 | \n",
"
\n",
" \n",
" 645 | \n",
" 0 | \n",
" 1 | \n",
" Julia | \n",
" Taylor | \n",
" 2015-07-31 | \n",
" London | \n",
" hannah88@powers.com | \n",
" 0 | \n",
"
\n",
" \n",
" 388 | \n",
" 4 | \n",
" 7 | \n",
" Noah | \n",
" Watson | \n",
" 2008-02-05 | \n",
" tolon | \n",
" matthew78@ballard-mcdonald.net | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cluster_low unique_id first_name surname dob city \\\n",
"0 0 2 Julia Taylor 2016-01-27 London \n",
"387 0 3 Julia Taylor 2015-10-29 None \n",
"644 0 0 Julia None 2015-10-29 London \n",
"645 0 1 Julia Taylor 2015-07-31 London \n",
"388 4 7 Noah Watson 2008-02-05 tolon \n",
"\n",
" email group \n",
"0 hannah88@powers.com 0 \n",
"387 hannah88opowersc@m 0 \n",
"644 hannah88@powers.com 0 \n",
"645 hannah88@powers.com 0 \n",
"388 matthew78@ballard-mcdonald.net 1 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cluster at a probability threshold of 50%\n",
"from splink.cluster import clusters_at_thresholds\n",
"nodes_with_clusters = clusters_at_thresholds(df, df_e, {'cluster_low':0.75}, linker.model)\n",
"nodes_with_clusters.toPandas().sort_values(\"cluster_low\").head()"
]
},
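{
"cell_type": "markdown",
"metadata": {},
"source": [
"`clusters_at_thresholds` takes a dictionary mapping an output column name to a probability threshold, so, assuming the dict accepts multiple entries, several clusterings can be produced in one call. A sketch (the `cluster_high` name and 0.95 threshold are illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cluster at two thresholds at once; each dict entry yields one\n",
"# cluster-id column ('cluster_high' is a hypothetical name)\n",
"nodes_multi = clusters_at_thresholds(\n",
"    df, df_e, {\"cluster_low\": 0.75, \"cluster_high\": 0.95}, linker.model\n",
")"
]
},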
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
" cluster_low | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 16 | \n",
" 394 | \n",
"
\n",
" \n",
" 1 | \n",
" 10 | \n",
" 804 | \n",
"
\n",
" \n",
" 2 | \n",
" 10 | \n",
" 654 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count cluster_low\n",
"0 16 394\n",
"1 10 804\n",
"2 10 654"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Get a few of the largest clusters\n",
"nodes_with_clusters.createOrReplaceTempView(\"nodes_with_clusters\")\n",
"sql = \"\"\"\n",
"select count(*) as count, cluster_low\n",
"from nodes_with_clusters\n",
"group by cluster_low\n",
"order by count(*) desc\n",
"limit 10\n",
"\n",
"\"\"\"\n",
"largest_clusters = spark.sql(sql).toPandas().head(10)\n",
"display(largest_clusters.head(3))\n",
"cluster_ids = list(largest_clusters[\"cluster_low\"])"
]
},
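{
"cell_type": "markdown",
"metadata": {},
"source": [
"Equivalently, the same aggregation can be written with the PySpark DataFrame API instead of SQL:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Same aggregation as the SQL above, using the DataFrame API\n",
"largest_clusters_df = (\n",
"    nodes_with_clusters.groupBy(\"cluster_low\")\n",
"    .agg(F.count(\"*\").alias(\"count\"))\n",
"    .orderBy(F.col(\"count\").desc())\n",
"    .limit(10)\n",
")\n",
"largest_clusters_df.show(3)"
]
},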
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[394, 804, 654, 279, 517, 301, 664, 105, 581, 194]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster_ids"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The visualisation needs a list of edges and nodes. `splink_cluster_studio` contains functions to create and format these tables ready for input into the vis\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from splink_cluster_studio import (\n",
" get_edges_corresponding_to_clusters_from_spark,\n",
" get_nodes_corresponding_to_clusters_from_spark,\n",
")\n",
"\n",
"nodes_for_vis_pd = get_nodes_corresponding_to_clusters_from_spark(\n",
" nodes_with_clusters, \"cluster_low\", cluster_ids\n",
")\n",
"edges_for_vis_pd = get_edges_corresponding_to_clusters_from_spark(\n",
" nodes_with_clusters, df_e, \"cluster_low\", cluster_ids\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, we can compute graph metrics, which will then be displayed in the vis. \n",
"\n",
"If we have ground truth clusters, this information will also be displayed in the vis\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from splink_cluster_studio import compute_node_metrics, compute_edge_metrics, compute_cluster_metrics\n",
"nodes_for_vis_pd = compute_node_metrics(nodes_for_vis_pd, edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n",
"edges_for_vis_pd = compute_edge_metrics( edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n",
"clusters_for_vis_pd = compute_cluster_metrics(edges_for_vis_pd, \"cluster_low\")"
]
},
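{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see which metric columns were added, inspect the dataframes. This is just a quick sanity check; the exact column names are whatever the metric functions produce:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the columns now present on each table, including the added metrics\n",
"print(nodes_for_vis_pd.columns.tolist())\n",
"print(edges_for_vis_pd.columns.tolist())\n",
"print(clusters_for_vis_pd.columns.tolist())"
]
},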
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The vis is rendered to a file, which you can load in your browser or dislplay in an iframe in Jupyter"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from splink_cluster_studio import render_html_vis\n",
"splink_settings_dict = linker.model.current_settings_obj.settings_dict\n",
"render_html_vis(\n",
" nodes_for_vis_pd, edges_for_vis_pd, splink_settings_dict, \"interactive_clusters.html\", \"cluster_low\", overwrite=True, df_cluster_metrics=clusters_for_vis_pd,)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show outputted html file in iframe in Juptyer\n",
"from IPython.display import IFrame\n",
"\n",
"IFrame(\n",
" src=\"./interactive_clusters.html\", width=1400, height=1200\n",
") # Show outputted html file in iframe in Juptyer"
]
},
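{
"cell_type": "markdown",
"metadata": {},
"source": [
"Outside Jupyter, the rendered file can be opened directly in the default browser using the standard library:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import webbrowser\n",
"\n",
"# Open the rendered visualisation in the default browser (handy when\n",
"# running this as a script rather than a notebook)\n",
"webbrowser.open(\"interactive_clusters.html\")"
]
},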
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}