{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Splink cluster studio\n", "\n", "This demo shows how to create an interactive visualisation of clusters.\n", "\n", "The first few steps use the model trained in the deduplication quickstart example" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import altair as alt\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "22/01/11 05:47:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.\n", "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.\n", " \r" ] } ], "source": [ "from utility_functions.demo_utils import get_spark\n", "spark = get_spark() # See utility_functions/demo_utils.py for how to set up Spark\n", "df = spark.read.parquet(\"data/fake_1000.parquet\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "22/01/11 05:48:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n" ] } ], "source": [ "from splink import Splink\n", "\n", "settings = {\n", " \"link_type\": \"dedupe_only\",\n", " \"blocking_rules\": [\"l.surname = r.surname\",\n", " \"l.first_name = r.first_name\",\n", " \"l.dob = r.dob\",\n", " \"l.email = r.email\",\n", " ],\n", " \"comparison_columns\": [\n", " {\n", " \"col_name\": \"first_name\",\n", " \"num_levels\": 3,\n", " \"term_frequency_adjustments\": True,\n", " \"m_probabilities\": [\n", " 0.3941434323787689,\n", " 0.14060422778129578,\n", " 0.4652523398399353,\n", " ],\n", " \"u_probabilities\": [\n", " 0.9941955208778381,\n", " 0.0028420439921319485,\n", " 0.002962463302537799,\n", " ],\n", " },\n", " {\n", " \"col_name\": \"surname\",\n", " \"num_levels\": 3,\n", " \"term_frequency_adjustments\": True,\n", " \"m_probabilities\": [\n", " 0.3971782326698303,\n", " 0.11397389322519302,\n", " 0.48884785175323486,\n", " ],\n", " \"u_probabilities\": [\n", " 0.9930331110954285,\n", " 0.00222682929597795,\n", " 0.004740049596875906,\n", " ],\n", " },\n", " {\n", " \"col_name\": \"dob\",\n", " \"m_probabilities\": [0.38818904757499695, 0.6118109226226807],\n", " \"u_probabilities\": [0.9997655749320984, 0.00023440067889168859],\n", " },\n", " {\n", " \"col_name\": \"city\",\n", " \"case_expression\": \"case\\n when city_l is null or city_r is null then -1\\n when city_l = city_r then 1\\n else 0 end as gamma_city\",\n", " \"m_probabilities\": [0.29216697812080383, 0.7078329920768738],\n", " \"u_probabilities\": [0.9105007648468018, 0.08949924260377884],\n", " },\n", " {\n", " \"col_name\": \"email\",\n", " \"m_probabilities\": [0.32461094856262207, 0.6753890514373779],\n", " \"u_probabilities\": [0.999818742275238, 0.00018127892690245062],\n", " },\n", " ],\n", " \"additional_columns_to_retain\": [\"group\"],\n", " \"proportion_of_matches\": 0.005672720726579428,\n", " }\n", "\n", "\n", "\n", "linker = Splink(settings, df, spark)\n", "df_e = linker.manually_apply_fellegi_sunter_weights()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cluster_lowunique_idfirst_namesurnamedobcityemailgroup
002JuliaTaylor2016-01-27Londonhannah88@powers.com0
38703JuliaTaylor2015-10-29Nonehannah88opowersc@m0
64400JuliaNone2015-10-29Londonhannah88@powers.com0
64501JuliaTaylor2015-07-31Londonhannah88@powers.com0
38847NoahWatson2008-02-05tolonmatthew78@ballard-mcdonald.net1
\n", "
" ], "text/plain": [ " cluster_low unique_id first_name surname dob city \\\n", "0 0 2 Julia Taylor 2016-01-27 London \n", "387 0 3 Julia Taylor 2015-10-29 None \n", "644 0 0 Julia None 2015-10-29 London \n", "645 0 1 Julia Taylor 2015-07-31 London \n", "388 4 7 Noah Watson 2008-02-05 tolon \n", "\n", " email group \n", "0 hannah88@powers.com 0 \n", "387 hannah88opowersc@m 0 \n", "644 hannah88@powers.com 0 \n", "645 hannah88@powers.com 0 \n", "388 matthew78@ballard-mcdonald.net 1 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cluster at a probability threshold of 50%\n", "from splink.cluster import clusters_at_thresholds\n", "nodes_with_clusters = clusters_at_thresholds(df, df_e, {'cluster_low':0.75}, linker.model)\n", "nodes_with_clusters.toPandas().sort_values(\"cluster_low\").head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countcluster_low
016394
110804
210654
\n", "
" ], "text/plain": [ " count cluster_low\n", "0 16 394\n", "1 10 804\n", "2 10 654" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Get a few of the largest clusters\n", "nodes_with_clusters.createOrReplaceTempView(\"nodes_with_clusters\")\n", "sql = \"\"\"\n", "select count(*) as count, cluster_low\n", "from nodes_with_clusters\n", "group by cluster_low\n", "order by count(*) desc\n", "limit 10\n", "\n", "\"\"\"\n", "largest_clusters = spark.sql(sql).toPandas().head(10)\n", "display(largest_clusters.head(3))\n", "cluster_ids = list(largest_clusters[\"cluster_low\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[394, 804, 654, 279, 517, 301, 664, 105, 581, 194]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cluster_ids" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The visualisation needs a list of edges and nodes. `splink_cluster_studio` contains functions to create and format these tables ready for input into the vis\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from splink_cluster_studio import (\n", " get_edges_corresponding_to_clusters_from_spark,\n", " get_nodes_corresponding_to_clusters_from_spark,\n", ")\n", "\n", "nodes_for_vis_pd = get_nodes_corresponding_to_clusters_from_spark(\n", " nodes_with_clusters, \"cluster_low\", cluster_ids\n", ")\n", "edges_for_vis_pd = get_edges_corresponding_to_clusters_from_spark(\n", " nodes_with_clusters, df_e, \"cluster_low\", cluster_ids\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, we can compute graph metrics, which will then be displayed in the vis. \n", "\n", "If we have ground truth clusters, this information will also be displayed in the vis\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from splink_cluster_studio import compute_node_metrics, compute_edge_metrics, compute_cluster_metrics\n", "nodes_for_vis_pd = compute_node_metrics(nodes_for_vis_pd, edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n", "edges_for_vis_pd = compute_edge_metrics( edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n", "clusters_for_vis_pd = compute_cluster_metrics(edges_for_vis_pd, \"cluster_low\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The vis is rendered to a file, which you can load in your browser or dislplay in an iframe in Jupyter" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "from splink_cluster_studio import render_html_vis\n", "splink_settings_dict = linker.model.current_settings_obj.settings_dict\n", "render_html_vis(\n", " nodes_for_vis_pd, edges_for_vis_pd, splink_settings_dict, \"interactive_clusters.html\", \"cluster_low\", overwrite=True, df_cluster_metrics=clusters_for_vis_pd,)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Show outputted html file in iframe in Juptyer\n", "from IPython.display import IFrame\n", "\n", "IFrame(\n", " src=\"./interactive_clusters.html\", width=1400, height=1200\n", ") # Show outputted html file in iframe in Juptyer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }