{ "cells": [ { "cell_type": "markdown", "id": "2d37ceab", "metadata": {}, "source": [ "# Auffällige Mehrfachwerte je Provider\n", "Einige Felder können je `dataprovider_id` mehrere unterschiedliche Werte haben (z.B. durch Datenmischung).\n", "Dafür wird ein Solr JSON-Facet genutzt und anschließend client-seitig auf `nvals > 1` sowie 32-stellige IDs gefiltert.\n", "Hinweis: `unique(...)` ist in Solr oft ein schneller, näherungsweiser Distinct-Count (für das Finden von Kandidaten meist ausreichend)." ] }, { "cell_type": "code", "execution_count": 1, "id": "9b396cd4", "metadata": { "execution": { "iopub.execute_input": "2026-04-03T05:09:41.605416Z", "iopub.status.busy": "2026-04-03T05:09:41.605182Z", "iopub.status.idle": "2026-04-03T05:09:44.736280Z", "shell.execute_reply": "2026-04-03T05:09:44.735262Z" } }, "outputs": [], "source": [
 "import json\n",
 "import warnings\n",
 "\n",
 "import pandas as pd\n",
 "import requests\n",
 "\n",
 "# Load organization records (institutions and aggregators) into a DataFrame.\n",
 "url_org = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/organization/select\"\n",
 "params_org = {\n",
 "    \"q\": \"type:ddb-institution OR type:ddb-aggregator\",\n",
 "    \"fl\": \"id,label\",\n",
 "    \"rows\": 10000,\n",
 "    \"wt\": \"json\",\n",
 "}\n",
 "\n",
 "resp_org = requests.get(url_org, params=params_org, timeout=300)\n",
 "resp_org.raise_for_status()\n",
 "data_org = resp_org.json()\n",
 "\n",
 "response_org = data_org.get(\"response\", {})\n",
 "docs = response_org.get(\"docs\", [])\n",
 "\n",
 "# \"rows\" caps the result size: warn if Solr matched more records than it returned,\n",
 "# because providers missing here would later show up without a label.\n",
 "num_found = response_org.get(\"numFound\", len(docs))\n",
 "if num_found > len(docs):\n",
 "    warnings.warn(\n",
 "        f\"Organization query matched {num_found} records but only {len(docs)} were returned; \"\n",
 "        \"increase 'rows' to avoid missing labels.\"\n",
 "    )\n",
 "\n",
 "\n",
 "def _as_list(value):\n",
 "    \"\"\"Return value as a list: None -> [], a list unchanged, anything else wrapped in a list.\"\"\"\n",
 "    if value is None:\n",
 "        return []\n",
 "    # The field may arrive as a list or as a single scalar; normalise to a list.\n",
 "    return value if isinstance(value, list) else [value]\n",
 "\n",
 "\n",
 "df_org = pd.DataFrame({\n",
 "    \"id\": [d.get(\"id\") for d in docs],\n",
 "    \"label\": [_as_list(d.get(\"label\")) for d in docs],\n",
 "})"
 ] }, { "cell_type": "code", "execution_count": 2, "id": "dfcc0661", "metadata": { "execution": { "iopub.execute_input": "2026-04-03T05:09:44.738290Z", "iopub.status.busy": "2026-04-03T05:09:44.738002Z", "iopub.status.idle": "2026-04-03T05:09:52.330638Z", "shell.execute_reply": "2026-04-03T05:09:52.329673Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dataprovider_iddataprovider_fct_nvalueslabel
07A5GWWGTIAUDXLN6JNTFKKZUM27GCGGI2[Staatliche Kunstsammlungen Dresden. GRASSI Mu...
1ZCXCMB7WXARQK27QKZY6ZQSH23D4YEOA2[Queer*Feministische Bibliothek und Archiv LIE...
\n", "
" ], "text/plain": [ " dataprovider_id dataprovider_fct_nvalues \\\n", "0 7A5GWWGTIAUDXLN6JNTFKKZUM27GCGGI 2 \n", "1 ZCXCMB7WXARQK27QKZY6ZQSH23D4YEOA 2 \n", "\n", " label \n", "0 [Staatliche Kunstsammlungen Dresden. GRASSI Mu... \n", "1 [Queer*Feministische Bibliothek und Archiv LIE... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "url = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/search/select\"\n",
 "params = {\n",
 "    \"q\": \"*:*\",\n",
 "    \"rows\": 0,\n",
 "    \"wt\": \"json\",\n",
 "    \"json.facet\": json.dumps({\n",
 "        \"providers\": {\n",
 "            \"type\": \"terms\",\n",
 "            \"field\": \"dataprovider_id\",\n",
 "            \"limit\": -1,\n",
 "            \"mincount\": 1,\n",
 "            \"facet\": {\n",
 "                # Distinct count per provider (usually approximate/HLL, but fast).\n",
 "                \"nvals\": \"unique(dataprovider_fct)\"\n",
 "            },\n",
 "            \"sort\": \"nvals desc\"\n",
 "        }\n",
 "    })\n",
 "}\n",
 "\n",
 "resp = requests.get(url, params=params, timeout=300)\n",
 "# Fail on HTTP errors instead of trying to parse an error body as facet data.\n",
 "resp.raise_for_status()\n",
 "j = resp.json()\n",
 "# A missing facet key (e.g. for an empty result set) is treated as \"no buckets\".\n",
 "buckets = j.get(\"facets\", {}).get(\"providers\", {}).get(\"buckets\", [])\n",
 "\n",
 "# Keep only conspicuous providers: more than one distinct value and a 32-character ID.\n",
 "# Filtering both conditions up front means the empty check below covers either case.\n",
 "auffaellig = [\n",
 "    b for b in buckets\n",
 "    if b.get(\"nvals\", 0) > 1 and len(str(b.get(\"val\", \"\"))) == 32\n",
 "]\n",
 "\n",
 "# Explicit columns keep the frame well-formed even when there are no rows.\n",
 "df_auffaellig = pd.DataFrame(\n",
 "    [\n",
 "        {\n",
 "            \"dataprovider_id\": str(b.get(\"val\", \"\")),\n",
 "            \"dataprovider_fct_nvalues\": int(b.get(\"nvals\", 0) or 0),\n",
 "        }\n",
 "        for b in auffaellig\n",
 "    ],\n",
 "    columns=[\"dataprovider_id\", \"dataprovider_fct_nvalues\"],\n",
 ")\n",
 "\n",
 "if df_auffaellig.empty:\n",
 "    print(\"Keine Mehrfachwerte gefunden!👍\")\n",
 "else:\n",
 "    # Attach the organization labels from df_org (built in the first code cell);\n",
 "    # the left join keeps providers that have no matching organization record.\n",
 "    df_auffaellig = (\n",
 "        df_auffaellig\n",
 "        .merge(df_org[[\"id\", \"label\"]], left_on=\"dataprovider_id\", right_on=\"id\", how=\"left\")\n",
 "        .drop(columns=[\"id\"])\n",
 "    )\n",
 "    display(df_auffaellig)"
 ] }, { "cell_type": "markdown", "id": "7fe49490", "metadata": {}, "source": [ "## 
Variante: Mehrfachwerte in `sector_fct`\n", "Gleiche Logik wie oben, aber für `sector_fct` (Sektor-Klassifikation).\n", "Ergebnis: `df_auffaellig`, angereichert mit `label` aus `df_org`." ] }, { "cell_type": "code", "execution_count": 3, "id": "85032a1c", "metadata": { "execution": { "iopub.execute_input": "2026-04-03T05:09:52.332581Z", "iopub.status.busy": "2026-04-03T05:09:52.332406Z", "iopub.status.idle": "2026-04-03T05:09:58.575702Z", "shell.execute_reply": "2026-04-03T05:09:58.574777Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Keine Mehrfachwerte gefunden!👍\n" ] } ], "source": [ "url = \"https://api.deutsche-digitale-bibliothek.de/2/search/index/search/select\"\n", "params = {\n", " \"q\": \"*:*\",\n", " \"rows\": 0,\n", " \"wt\": \"json\",\n", " \"json.facet\": json.dumps({\n", " \"providers\": {\n", " \"type\": \"terms\",\n", " \"field\": \"dataprovider_id\",\n", " \"limit\": -1,\n", " \"mincount\": 1,\n", " \"facet\": {\n", " # distinct count (i.d.R. 
HLL/approx, dafür schnell)\n",
 "                \"nvals\": \"unique(sector_fct)\"\n",
 "            },\n",
 "            \"sort\": \"nvals desc\"\n",
 "        }\n",
 "    })\n",
 "}\n",
 "\n",
 "resp = requests.get(url, params=params, timeout=300)\n",
 "# Fail on HTTP errors instead of trying to parse an error body as facet data.\n",
 "resp.raise_for_status()\n",
 "j = resp.json()\n",
 "# A missing facet key (e.g. for an empty result set) is treated as \"no buckets\".\n",
 "buckets = j.get(\"facets\", {}).get(\"providers\", {}).get(\"buckets\", [])\n",
 "\n",
 "# Keep only conspicuous providers: more than one distinct value and a 32-character ID.\n",
 "# Filtering both conditions up front means the empty check below covers either case.\n",
 "auffaellig = [\n",
 "    b for b in buckets\n",
 "    if b.get(\"nvals\", 0) > 1 and len(str(b.get(\"val\", \"\"))) == 32\n",
 "]\n",
 "\n",
 "# Explicit columns keep the frame well-formed even when there are no rows.\n",
 "df_auffaellig = pd.DataFrame(\n",
 "    [\n",
 "        {\n",
 "            \"dataprovider_id\": str(b.get(\"val\", \"\")),\n",
 "            \"sector_fct_nvalues\": int(b.get(\"nvals\", 0) or 0),\n",
 "        }\n",
 "        for b in auffaellig\n",
 "    ],\n",
 "    columns=[\"dataprovider_id\", \"sector_fct_nvalues\"],\n",
 ")\n",
 "\n",
 "if df_auffaellig.empty:\n",
 "    print(\"Keine Mehrfachwerte gefunden!👍\")\n",
 "else:\n",
 "    # Attach the organization labels from df_org (built in the first code cell);\n",
 "    # the left join keeps providers that have no matching organization record.\n",
 "    df_auffaellig = (\n",
 "        df_auffaellig\n",
 "        .merge(df_org[[\"id\", \"label\"]], left_on=\"dataprovider_id\", right_on=\"id\", how=\"left\")\n",
 "        .drop(columns=[\"id\"])\n",
 "    )\n",
 "    display(df_auffaellig)"
 ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.13" } }, "nbformat": 4, "nbformat_minor": 5 }