import requests
import pandas as pd
import json
import warnings

# Load organization records into a DataFrame. The query selects both
# institutions (type:ddb-institution) and aggregators (type:ddb-aggregator).
url_org = "https://api.deutsche-digitale-bibliothek.de/2/search/index/organization/select"
params_org = {
    "q": "type:ddb-institution OR type:ddb-aggregator",
    "fl": "id,label",
    "rows": 10000,
    "wt": "json",
}

resp_org = requests.get(url_org, params=params_org, timeout=300)
resp_org.raise_for_status()
data_org = resp_org.json()

response_org = data_org.get("response", {})
docs = response_org.get("docs", [])

# The request asks for a fixed number of rows. If the index reports more
# matches than were returned, the table below would be silently incomplete,
# so surface that instead of continuing unnoticed. A warning (not an error)
# keeps the notebook runnable.
num_found = response_org.get("numFound")
if isinstance(num_found, int) and num_found > len(docs):
    warnings.warn(
        f"Organization query matched {num_found} documents but only "
        f"{len(docs)} were returned; increase 'rows' or paginate."
    )


def _as_list(value: object) -> list:
    """Normalize a field value to a list.

    Args:
        value: A field value taken from a result document; may be ``None``,
            a list, or a single scalar.

    Returns:
        ``[]`` for ``None``, the value itself if it already is a list,
        otherwise a one-element list wrapping the value.
    """
    if value is None:
        return []
    # Solr usually returns lists for *_fct fields; as a safeguard a scalar
    # is normalized to a one-element list as well.
    return value if isinstance(value, list) else [value]


# One row per organization; "label" is always a list (possibly empty).
df_org = pd.DataFrame({
    "id": [d.get("id") for d in docs],
    "label": [_as_list(d.get("label")) for d in docs],
})

# df_org
| \n", " | dataprovider_id | \n", "dataprovider_fct_nvalues | \n", "label | \n", "
|---|---|---|---|
| 0 | \n", "7A5GWWGTIAUDXLN6JNTFKKZUM27GCGGI | \n", "2 | \n", "[Staatliche Kunstsammlungen Dresden. GRASSI Mu... | \n", "
| 1 | \n", "ZCXCMB7WXARQK27QKZY6ZQSH23D4YEOA | \n", "2 | \n", "[Queer*Feministische Bibliothek und Archiv LIE... | \n", "