{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2d37ceab",
   "metadata": {},
   "source": [
    "# Auffällige Mehrfachwerte je Provider\n",
    "Einige Felder können je `dataprovider_id` mehrere unterschiedliche Werte haben (z.B. durch Datenmischung).\n",
    "Dafür wird ein Solr JSON-Facet genutzt und anschließend client-seitig auf `nvals > 1` sowie 32-stellige IDs gefiltert.\n",
    "Hinweis: `unique(...)` ist in Solr oft ein schneller, näherungsweiser Distinct-Count (für das Finden von Kandidaten meist ausreichend)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9b396cd4",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-03T05:09:41.605416Z",
     "iopub.status.busy": "2026-04-03T05:09:41.605182Z",
     "iopub.status.idle": "2026-04-03T05:09:44.736280Z",
     "shell.execute_reply": "2026-04-03T05:09:44.735262Z"
    }
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Organization-Daten (ddb-institution) laden -> DataFrame\n",
    "url_org = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/organization/select\"\n",
    "params_org = {\n",
    "    \"q\": \"type:ddb-institution OR type:ddb-aggregator\",\n",
    "    \"fl\": \"id,label\",\n",
    "    \"rows\": 10000,\n",
    "    \"wt\": \"json\",\n",
    "}\n",
    "\n",
    "resp_org = requests.get(url_org, params=params_org, timeout=300)\n",
    "resp_org.raise_for_status()\n",
    "data_org = resp_org.json()\n",
    "\n",
    "docs = data_org.get(\"response\", {}).get(\"docs\", [])\n",
    "\n",
    "def _as_list(value):\n",
    "    if value is None:\n",
    "        return []\n",
    "    # Solr liefert bei *_fct i.d.R. Listen; zur Sicherheit wird auch ein Scalar zu einer Liste normalisiert\n",
    "    return value if isinstance(value, list) else [value]\n",
    "\n",
    "df_org = pd.DataFrame({\n",
    "    \"id\": [d.get(\"id\") for d in docs],\n",
    "    \"label\": [_as_list(d.get(\"label\")) for d in docs],\n",
    "})\n",
    "\n",
    "# df_org"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dfcc0661",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-03T05:09:44.738290Z",
     "iopub.status.busy": "2026-04-03T05:09:44.738002Z",
     "iopub.status.idle": "2026-04-03T05:09:52.330638Z",
     "shell.execute_reply": "2026-04-03T05:09:52.329673Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>dataprovider_id</th>\n",
       "      <th>dataprovider_fct_nvalues</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7A5GWWGTIAUDXLN6JNTFKKZUM27GCGGI</td>\n",
       "      <td>2</td>\n",
       "      <td>[Staatliche Kunstsammlungen Dresden. GRASSI Mu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ZCXCMB7WXARQK27QKZY6ZQSH23D4YEOA</td>\n",
       "      <td>2</td>\n",
       "      <td>[Queer*Feministische Bibliothek und Archiv LIE...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    dataprovider_id  dataprovider_fct_nvalues  \\\n",
       "0  7A5GWWGTIAUDXLN6JNTFKKZUM27GCGGI                         2   \n",
       "1  ZCXCMB7WXARQK27QKZY6ZQSH23D4YEOA                         2   \n",
       "\n",
       "                                               label  \n",
       "0  [Staatliche Kunstsammlungen Dresden. GRASSI Mu...  \n",
       "1  [Queer*Feministische Bibliothek und Archiv LIE...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "url = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/search/select\"\n",
    "params = {\n",
    "    \"q\": \"*:*\",\n",
    "    \"rows\": 0,\n",
    "    \"wt\": \"json\",\n",
    "    \"json.facet\": json.dumps({\n",
    "        \"providers\": {\n",
    "            \"type\": \"terms\",\n",
    "            \"field\": \"dataprovider_id\",\n",
    "            \"limit\": -1,\n",
    "            \"mincount\": 1,\n",
    "            \"facet\": {\n",
    "                # distinct count (i.d.R. HLL/approx, dafür schnell)\n",
    "                \"nvals\": \"unique(dataprovider_fct)\"\n",
    "            },\n",
    "            \"sort\": \"nvals desc\"\n",
    "        }\n",
    "    })\n",
    "}\n",
    "\n",
    "j = requests.get(url, params=params, timeout=300).json()\n",
    "buckets = j[\"facets\"][\"providers\"][\"buckets\"]\n",
    "\n",
    "# Nur auffällige Fälle\n",
    "auffaellig = [b for b in buckets if b.get(\"nvals\", 0) > 1]\n",
    "\n",
    "# -> DataFrame\n",
    "df_auffaellig = pd.DataFrame([\n",
    "    {\n",
    "        \"dataprovider_id\": str(b.get(\"val\", \"\")),\n",
    "        \"dataprovider_fct_nvalues\": int(b.get(\"nvals\", 0) or 0),\n",
    "    }\n",
    "    for b in auffaellig\n",
    "])\n",
    "\n",
    "if not df_auffaellig.empty:\n",
    "    # nur 32-stellige IDs behalten, falls df_auffaellig nicht leer ist\n",
    "    if not df_auffaellig.empty:\n",
    "        df_auffaellig = df_auffaellig[df_auffaellig[\"dataprovider_id\"].str.len() == 32].copy()\n",
    "\n",
    "    # Label aus df_org anreichern (falls df_org existiert)\n",
    "    df_auffaellig = (\n",
    "        df_auffaellig\n",
    "        .merge(df_org[[\"id\", \"label\"]], left_on=\"dataprovider_id\", right_on=\"id\", how=\"left\")\n",
    "        .drop(columns=[\"id\"])\n",
    "    )\n",
    "    display(df_auffaellig)\n",
    "else:\n",
    "    print(\"Keine Mehrfachwerte gefunden!👍\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7fe49490",
   "metadata": {},
   "source": [
    "## Variante: Mehrfachwerte in `sector_fct`\n",
    "Gleiche Logik wie oben, aber für `sector_fct` (Sektor-Klassifikation).\n",
    "Ergebnis: `df_auffaellig`, angereichert mit `label` aus `df_join`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "85032a1c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-03T05:09:52.332581Z",
     "iopub.status.busy": "2026-04-03T05:09:52.332406Z",
     "iopub.status.idle": "2026-04-03T05:09:58.575702Z",
     "shell.execute_reply": "2026-04-03T05:09:58.574777Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Keine Mehrfachwerte gefunden!👍\n"
     ]
    }
   ],
   "source": [
    "url = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/search/select\"\n",
    "params = {\n",
    "    \"q\": \"*:*\",\n",
    "    \"rows\": 0,\n",
    "    \"wt\": \"json\",\n",
    "    \"json.facet\": json.dumps({\n",
    "        \"providers\": {\n",
    "            \"type\": \"terms\",\n",
    "            \"field\": \"dataprovider_id\",\n",
    "            \"limit\": -1,\n",
    "            \"mincount\": 1,\n",
    "            \"facet\": {\n",
    "                # distinct count (i.d.R. HLL/approx, dafür schnell)\n",
    "                \"nvals\": \"unique(sector_fct)\"\n",
    "            },\n",
    "            \"sort\": \"nvals desc\"\n",
    "        }\n",
    "    })\n",
    "}\n",
    "\n",
    "j = requests.get(url, params=params, timeout=300).json()\n",
    "buckets = j[\"facets\"][\"providers\"][\"buckets\"]\n",
    "\n",
    "# Nur auffällige Fälle\n",
    "auffaellig = [b for b in buckets if b.get(\"nvals\", 0) > 1]\n",
    "\n",
    "# -> DataFrame\n",
    "df_auffaellig = pd.DataFrame([\n",
    "    {\n",
    "        \"dataprovider_id\": str(b.get(\"val\", \"\")),\n",
    "        \"sector_fct_nvalues\": int(b.get(\"nvals\", 0) or 0),\n",
    "    }\n",
    "    for b in auffaellig\n",
    "])\n",
    "\n",
    "# nur 32-stellige IDs behalten, falls df_auffaellig nicht leer ist\n",
    "if not df_auffaellig.empty:\n",
    "    df_auffaellig = df_auffaellig[df_auffaellig[\"dataprovider_id\"].str.len() == 32].copy()\n",
    "\n",
    "    # Label aus df_org anreichern (falls df_org existiert)\n",
    "    df_auffaellig = (\n",
    "        df_auffaellig\n",
    "        .merge(df_org[[\"id\", \"label\"]], left_on=\"dataprovider_id\", right_on=\"id\", how=\"left\")\n",
    "        .drop(columns=[\"id\"])\n",
    "    )\n",
    "    display(df_auffaellig)   \n",
    "else:\n",
    "    print(\"Keine Mehrfachwerte gefunden!👍\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}