{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Splink cluster studio\n",
    "\n",
    "This demo shows how to create an interactive visualisation of clusters.\n",
    "\n",
    "The first few steps use the model trained in the deduplication quickstart example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import altair as alt\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/01/11 05:47:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.\n",
      "22/01/11 05:47:56 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.\n",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "from utility_functions.demo_utils import get_spark\n",
    "spark = get_spark()  # See utility_functions/demo_utils.py for how to set up Spark\n",
    "df = spark.read.parquet(\"data/fake_1000.parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/01/11 05:48:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
     ]
    }
   ],
   "source": [
    "from splink import Splink\n",
    "\n",
    "settings = {\n",
    "    \"link_type\": \"dedupe_only\",\n",
    "    \"blocking_rules\": [\"l.surname = r.surname\",\n",
    "                       \"l.first_name = r.first_name\",\n",
    "                       \"l.dob = r.dob\",\n",
    "                       \"l.email = r.email\",\n",
    "                        ],\n",
    "    \"comparison_columns\": [\n",
    "        {\n",
    "            \"col_name\": \"first_name\",\n",
    "            \"num_levels\": 3,\n",
    "            \"term_frequency_adjustments\": True,\n",
    "            \"m_probabilities\": [\n",
    "                0.3941434323787689,\n",
    "                0.14060422778129578,\n",
    "                0.4652523398399353,\n",
    "            ],\n",
    "            \"u_probabilities\": [\n",
    "                0.9941955208778381,\n",
    "                0.0028420439921319485,\n",
    "                0.002962463302537799,\n",
    "            ],\n",
    "        },\n",
    "        {\n",
    "            \"col_name\": \"surname\",\n",
    "            \"num_levels\": 3,\n",
    "            \"term_frequency_adjustments\": True,\n",
    "            \"m_probabilities\": [\n",
    "                0.3971782326698303,\n",
    "                0.11397389322519302,\n",
    "                0.48884785175323486,\n",
    "            ],\n",
    "            \"u_probabilities\": [\n",
    "                0.9930331110954285,\n",
    "                0.00222682929597795,\n",
    "                0.004740049596875906,\n",
    "            ],\n",
    "        },\n",
    "        {\n",
    "            \"col_name\": \"dob\",\n",
    "            \"m_probabilities\": [0.38818904757499695, 0.6118109226226807],\n",
    "            \"u_probabilities\": [0.9997655749320984, 0.00023440067889168859],\n",
    "        },\n",
    "        {\n",
    "            \"col_name\": \"city\",\n",
    "          \"case_expression\": \"case\\n    when city_l is null or city_r is null then -1\\n    when city_l = city_r then 1\\n    else 0 end as gamma_city\",\n",
    "            \"m_probabilities\": [0.29216697812080383, 0.7078329920768738],\n",
    "            \"u_probabilities\": [0.9105007648468018, 0.08949924260377884],\n",
    "        },\n",
    "        {\n",
    "            \"col_name\": \"email\",\n",
    "           \"m_probabilities\": [0.32461094856262207, 0.6753890514373779],\n",
    "            \"u_probabilities\": [0.999818742275238, 0.00018127892690245062],\n",
    "        },\n",
    "    ],\n",
    "    \"additional_columns_to_retain\": [\"group\"],\n",
    "    \"proportion_of_matches\": 0.005672720726579428,\n",
    "    }\n",
    "\n",
    "\n",
    "\n",
    "linker = Splink(settings, df, spark)\n",
    "df_e = linker.manually_apply_fellegi_sunter_weights()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cluster_low</th>\n",
       "      <th>unique_id</th>\n",
       "      <th>first_name</th>\n",
       "      <th>surname</th>\n",
       "      <th>dob</th>\n",
       "      <th>city</th>\n",
       "      <th>email</th>\n",
       "      <th>group</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Julia</td>\n",
       "      <td>Taylor</td>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>London</td>\n",
       "      <td>hannah88@powers.com</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>387</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Julia</td>\n",
       "      <td>Taylor</td>\n",
       "      <td>2015-10-29</td>\n",
       "      <td>None</td>\n",
       "      <td>hannah88opowersc@m</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>644</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Julia</td>\n",
       "      <td>None</td>\n",
       "      <td>2015-10-29</td>\n",
       "      <td>London</td>\n",
       "      <td>hannah88@powers.com</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>645</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Julia</td>\n",
       "      <td>Taylor</td>\n",
       "      <td>2015-07-31</td>\n",
       "      <td>London</td>\n",
       "      <td>hannah88@powers.com</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>388</th>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>Noah</td>\n",
       "      <td>Watson</td>\n",
       "      <td>2008-02-05</td>\n",
       "      <td>tolon</td>\n",
       "      <td>matthew78@ballard-mcdonald.net</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     cluster_low  unique_id first_name surname         dob    city  \\\n",
       "0              0          2     Julia   Taylor  2016-01-27  London   \n",
       "387            0          3     Julia   Taylor  2015-10-29    None   \n",
       "644            0          0     Julia     None  2015-10-29  London   \n",
       "645            0          1     Julia   Taylor  2015-07-31  London   \n",
       "388            4          7      Noah   Watson  2008-02-05   tolon   \n",
       "\n",
       "                              email  group  \n",
       "0               hannah88@powers.com      0  \n",
       "387              hannah88opowersc@m      0  \n",
       "644             hannah88@powers.com      0  \n",
       "645             hannah88@powers.com      0  \n",
       "388  matthew78@ballard-mcdonald.net      1  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cluster at a probability threshold of 50%\n",
    "from splink.cluster import clusters_at_thresholds\n",
    "nodes_with_clusters = clusters_at_thresholds(df, df_e, {'cluster_low':0.75}, linker.model)\n",
    "nodes_with_clusters.toPandas().sort_values(\"cluster_low\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>cluster_low</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16</td>\n",
       "      <td>394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10</td>\n",
       "      <td>804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10</td>\n",
       "      <td>654</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   count  cluster_low\n",
       "0     16          394\n",
       "1     10          804\n",
       "2     10          654"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Get a few of the largest clusters\n",
    "nodes_with_clusters.createOrReplaceTempView(\"nodes_with_clusters\")\n",
    "sql = \"\"\"\n",
    "select count(*) as count, cluster_low\n",
    "from nodes_with_clusters\n",
    "group by cluster_low\n",
    "order by count(*) desc\n",
    "limit 10\n",
    "\n",
    "\"\"\"\n",
    "largest_clusters = spark.sql(sql).toPandas().head(10)\n",
    "display(largest_clusters.head(3))\n",
    "cluster_ids = list(largest_clusters[\"cluster_low\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[394, 804, 654, 279, 517, 301, 664, 105, 581, 194]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cluster_ids"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The visualisation needs a list of edges and nodes.  `splink_cluster_studio` contains functions to create and format these tables ready for input into the vis\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from splink_cluster_studio import (\n",
    "    get_edges_corresponding_to_clusters_from_spark,\n",
    "    get_nodes_corresponding_to_clusters_from_spark,\n",
    ")\n",
    "\n",
    "nodes_for_vis_pd = get_nodes_corresponding_to_clusters_from_spark(\n",
    "    nodes_with_clusters, \"cluster_low\", cluster_ids\n",
    ")\n",
    "edges_for_vis_pd = get_edges_corresponding_to_clusters_from_spark(\n",
    "    nodes_with_clusters, df_e, \"cluster_low\", cluster_ids\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, we can compute graph metrics, which will then be displayed in the vis.  \n",
    "\n",
    "If we have ground truth clusters, this information will also be displayed in the vis\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from splink_cluster_studio import compute_node_metrics, compute_edge_metrics, compute_cluster_metrics\n",
    "nodes_for_vis_pd = compute_node_metrics(nodes_for_vis_pd, edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n",
    "edges_for_vis_pd = compute_edge_metrics( edges_for_vis_pd, \"cluster_low\", ground_truth_cluster_colname=\"group\")\n",
    "clusters_for_vis_pd = compute_cluster_metrics(edges_for_vis_pd, \"cluster_low\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The vis is rendered to a file, which you can load in your browser or dislplay in an iframe in Jupyter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from splink_cluster_studio import render_html_vis\n",
    "splink_settings_dict = linker.model.current_settings_obj.settings_dict\n",
    "render_html_vis(\n",
    "    nodes_for_vis_pd, edges_for_vis_pd, splink_settings_dict, \"interactive_clusters.html\", \"cluster_low\", overwrite=True, df_cluster_metrics=clusters_for_vis_pd,)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "        <iframe\n",
       "            width=\"1400\"\n",
       "            height=\"1200\"\n",
       "            src=\"./interactive_clusters.html\"\n",
       "            frameborder=\"0\"\n",
       "            allowfullscreen\n",
       "            \n",
       "        ></iframe>\n",
       "        "
      ],
      "text/plain": [
       "<IPython.lib.display.IFrame at 0x7f97587ae370>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show outputted html file in iframe in Juptyer\n",
    "from IPython.display import IFrame\n",
    "\n",
    "IFrame(\n",
    "    src=\"./interactive_clusters.html\", width=1400, height=1200\n",
    ")  # Show outputted html file in iframe in Juptyer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}