{ "cells": [ { "cell_type": "markdown", "id": "269f718f-3a08-4958-8a66-12918e51a3ec", "metadata": {}, "source": [ "# Harvest SRU API results as JSON\n", "\n", "You can query the People & Organisations data using the SRU (Search/Retrieve via URL) API. The easiest way to understand how to build SRU queries is to play around with the [online interface](http://www.nla.gov.au/apps/srw/search/peopleaustralia). More [information on the SRU protocol](https://www.loc.gov/standards/sru/) is available from the Library of Congress.\n", "\n", "Trove's people and organisation records are available in a number of XML formats, the richest and most complex of which is [EAC-CPF](https://eac.staatsbibliothek-berlin.de/). However, the XML records are not easy to work with, so to simplify further processing, this notebook queries the SRU interface and then converts the XML results into JSON." ] }, { "cell_type": "code", "execution_count": 1, "id": "c60c39de-30e9-4144-8af0-a8d1e68ac385", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "import requests_cache\n", "from bs4 import BeautifulSoup\n", "from IPython.display import JSON\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from tqdm.auto import tqdm\n", "\n", "s = requests_cache.CachedSession()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n", "s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n", "s.mount(\"http://\", HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "code", "execution_count": 2, "id": "36a093b9-0709-4855-bdb6-79ec679f32c2", "metadata": {}, "outputs": [], "source": [ "# Available SRU parameters\n", "\n", "params = {\n", " # 'query': 'rec.identifier=\"http://nla.gov.au/nla.party-641680\"', # Can specify a particular property, it not searches all (?) 
fields\n", " \"query\": \"\",\n", " \"version\": \"1.1\",\n", " \"operation\": \"searchRetrieve\",\n", " \"recordSchema\": \"urn:isbn:1-931666-33-4\", # This specifies records in EAC-CPF format\n", " \"maximumRecords\": 100,\n", " \"startRecord\": 1,\n", " \"resultSetTTL\": 300,\n", " \"recordPacking\": \"xml\",\n", " \"recordXPath\": \"\",\n", " \"sortKeys\": \"\",\n", "}\n", "\n", "# SRU endpoint\n", "api_url = \"http://www.nla.gov.au/apps/srw/search/peopleaustralia\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "48c959ef-681d-4ac8-8b22-da03d25da5ff", "metadata": {}, "outputs": [], "source": [ "def get_total_results(params):\n", " params[\"maximumRecords\"] = 0\n", " response = s.get(api_url, params=params)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " return int(soup.find(\"numberOfRecords\").string)" ] }, { "cell_type": "code", "execution_count": 4, "id": "9c006708-34f0-48f9-af50-3cfe59075ebe", "metadata": {}, "outputs": [], "source": [ "def soup_string(elem, prop):\n", " \"\"\"\n", " Saves on memory by not keeping BS navigable string\n", " \"\"\"\n", " if value := elem.find(prop):\n", " string = str(value.string).strip()\n", " if string == \"None\":\n", " string = value.get_text()\n", " return string\n", "\n", "\n", "def get_attr(elem, prop, attr):\n", " if value := elem.find(prop):\n", " return value.attrs.get(attr)\n", "\n", "\n", "def get_date(elem, prop):\n", " try:\n", " date = elem.find(prop)[\"standardDateTime\"]\n", " except (KeyError):\n", " try:\n", " date = elem.find(prop)[\"standardDate\"]\n", " except KeyError:\n", " date = soup_string(elem, prop)\n", " except TypeError:\n", " date = None\n", " return date\n", "\n", "\n", "def get_dates(history):\n", " dates = {}\n", " if history:\n", " for event in history.find_all(\"maintenanceEvent\"):\n", " event_type = soup_string(event, \"eventType\")\n", " event_date = get_date(event, \"eventDateTime\")\n", " if event_type == \"created\":\n", " dates[\"date_created\"] = event_date\n", " elif event_type == \"updated\":\n", " dates[\"date_modified\"] = event_date\n", " return dates\n", "\n", "\n", "def get_names(identity):\n", " names = []\n", " for name_entry in identity.find_all(\"nameEntry\"):\n", " name = {}\n", " for part in name_entry.find_all(\"part\"):\n", " if part.has_attr(\"localType\"):\n", " name_type = part[\"localType\"]\n", " else:\n", " name_type = \"name\"\n", " try:\n", " name[name_type].append(str(part.string))\n", " except (KeyError, AttributeError):\n", " name[name_type] = [str(part.string)]\n", " if name_entry.find(\"authorizedForm\"):\n", " name[\"authorized\"] = True\n", " else:\n", " name[\"authorized\"] = False\n", " names.append(name)\n", " return names\n", "\n", "\n", "def get_exist_dates(description):\n", " exist_dates = {}\n", " dates = description.find(\"existDates\")\n", " if dates:\n", " exist_dates[\"date_from\"] = get_date(dates, \"fromDate\")\n", " exist_dates[\"date_to\"] = get_date(dates, \"toDate\")\n", " return exist_dates\n", "\n", "\n", "def get_places(description):\n", " places = []\n", " places_elem = description.find(\"places\")\n", " if places_elem:\n", " for place_entry in places_elem.find_all(\"place\"):\n", " place = {\n", " \"place_type\": soup_string(place_entry, \"placeRole\"),\n", " \"name\": soup_string(place_entry, \"placeEntry\"),\n", " \"date_from\": get_date(place_entry, \"fromDate\"),\n", " \"date_to\": get_date(place_entry, \"toDate\"),\n", " }\n", " places.append(place)\n", " return places\n", "\n", "\n", "def get_events(description):\n", " events 
= []\n", " for event_list in description.find_all(\"chronList\"):\n", " for event in event_list.find_all(\"chronItem\"):\n", " events.append(\n", " {\n", " \"name\": soup_string(event, \"event\"),\n", " \"date\": get_date(event, \"date\"),\n", " \"date_from\": get_date(event, \"fromDate\"),\n", " \"date_to\": get_date(event, \"toDate\"),\n", " }\n", " )\n", " return events\n", "\n", "\n", "def get_occupations(description):\n", " occupations = []\n", " if occupation_list := description.find(\"occupations\"):\n", " for occupation in occupation_list.find_all(\"occupation\"):\n", " occupations.append(soup_string(occupation, \"term\"))\n", " return occupations\n", "\n", "\n", "def get_related_entities(eac):\n", " related = []\n", " for relation in eac.find_all(\"cpfRelation\"):\n", " # Can be resourceRelation or cpfRelation\n", " if description := relation.find(\"descriptiveNote\"):\n", " description = description.get_text().strip()\n", " else:\n", " description = None\n", " related.append(\n", " {\n", " \"relation_type\": relation.attrs.get(\"cpfRelationType\"),\n", " \"href\": relation.attrs.get(\"href\"),\n", " \"name\": soup_string(relation, \"relationEntry\"),\n", " \"entity_type\": get_attr(relation, \"relationEntry\", \"localType\"),\n", " \"date_from\": get_date(relation, \"fromDate\"),\n", " \"date_to\": get_date(relation, \"toDate\"),\n", " \"description\": description,\n", " }\n", " )\n", " return related\n", "\n", "\n", "def get_related_resources(eac):\n", " related = []\n", " for relation in eac.find_all(\"resourceRelation\"):\n", " # Can be resourceRelation or cpfRelation\n", " relation_type = relation.attrs.get(\"resourceRelationType\")\n", " if relation.find(\"dc\"):\n", " if description := relation.find_all(\"description\"):\n", " description = \" \".join([d.get_text() for d in description])\n", " related.append(\n", " {\n", " \"relation_type\": relation_type,\n", " \"href\": soup_string(relation, \"identifier\"),\n", " \"name\": soup_string(relation, \"title\"),\n", " \"resource_type\": None,\n", " \"contributor\": soup_string(relation, \"contributor\"),\n", " \"date\": soup_string(relation, \"date\"),\n", " \"description\": description,\n", " }\n", " )\n", " else:\n", " if description := relation.find(\"abstract\"):\n", " description = description.get_text()\n", " related.append(\n", " {\n", " \"relation_type\": relation_type,\n", " \"href\": relation.attrs.get(\"href\"),\n", " \"name\": soup_string(relation, \"relationEntry\"),\n", " \"resource_type\": get_attr(relation, \"relationEntry\", \"localType\"),\n", " \"contributor\": soup_string(relation, \"name\"),\n", " \"date\": soup_string(relation, \"date\"),\n", " \"description\": description,\n", " }\n", " )\n", " return related\n", "\n", "\n", "def get_biog(description):\n", " biog = []\n", " for bio in description.find_all(\"biogHist\"):\n", " for para in bio.find_all(\"p\"):\n", " biog.append(str(para.string).strip())\n", " return \" \".join(biog)\n", "\n", "\n", "def get_sources(eac):\n", " sources = []\n", " for source_eac in eac.find_all(\"eac-cpf\"):\n", " source = process_eac(source_eac)\n", " source[\"related_entities\"] = get_related_entities(source_eac)\n", " source[\"related_resources\"] = get_related_resources(source_eac)\n", " sources.append(source)\n", " return sources\n", "\n", "\n", "def get_agency_details(agency_element):\n", " agency = {\n", " \"agency_id\": soup_string(agency_element, \"agencyCode\"),\n", " \"agency_name\": soup_string(agency_element, \"agencyName\"),\n", " }\n", " return agency\n", 
"\n", "\n", "def get_eac_meta(eac):\n", " meta = {\"record_id\": soup_string(eac, \"recordId\")}\n", " control = eac.find(\"control\")\n", " # agency\n", " meta.update(get_agency_details(control.find(\"maintenanceAgency\")))\n", " meta.update(get_dates(control.find(\"maintenanceHistory\")))\n", " return meta\n", "\n", "\n", "def format_name(names, entity_type):\n", " authorized = None\n", " combined_names = []\n", " for name in names:\n", " if name[\"authorized\"] is True:\n", " authorized = name\n", " break\n", " if not authorized:\n", " try:\n", " authorized = names[0]\n", " except IndexError:\n", " pass\n", " if authorized:\n", " for name_type in [\"forename\", \"surname\", \"name\", \"parent\"]:\n", " combined_names += authorized.get(name_type, [])\n", " return \" \".join(combined_names)\n", "\n", "\n", "def process_eac(eac):\n", " record = get_eac_meta(eac)\n", " identity = eac.find(\"identity\")\n", " record[\"names\"] = get_names(identity)\n", " record[\"entity_type\"] = soup_string(identity, \"entityType\")\n", " record[\"entity_id\"] = soup_string(identity, \"entityId\")\n", " record[\"name\"] = format_name(record[\"names\"], record[\"entity_type\"])\n", " description = eac.find(\"description\")\n", " if not description:\n", " description = eac.find(\"cpfDescription\")\n", " record[\"dates\"] = get_exist_dates(description)\n", " record[\"places\"] = get_places(description)\n", " record[\"occupations\"] = get_occupations(description)\n", " record[\"abstract\"] = soup_string(description, \"abstract\")\n", " record[\"description\"] = get_biog(description)\n", " record[\"events\"] = get_events(description)\n", " record[\"sources\"] = get_sources(eac)\n", " return record\n", "\n", "\n", "def get_records(params):\n", " records = []\n", " response = s.get(api_url, params=params)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " for result in soup.find_all(\"record\"):\n", " eac = result.find(\"eac-cpf\")\n", " # get id info here\n", " record = process_eac(eac)\n", " record[\"trove_url\"] = f\"https://nla.gov.au/nla.party-{record['record_id']}\"\n", " records.append(record)\n", " return records\n", "\n", "\n", "def harvest_results(params):\n", " records = []\n", " total = get_total_results(params.copy())\n", " start = 1\n", " with tqdm(total=total) as pbar:\n", " while start <= total:\n", " params[\"start\"] = start\n", " new_records = get_records(params)\n", " records += new_records\n", " start += 100\n", " pbar.update(len(new_records))\n", " return records" ] }, { "cell_type": "code", "execution_count": null, "id": "28db810d-3718-42ab-be57-b554298ef2c7", "metadata": {}, "outputs": [], "source": [ "search_params = params.copy()\n", "search_params[\"query\"] = \"wragge\"\n", "results = harvest_results(search_params)" ] }, { "cell_type": "code", "execution_count": null, "id": "e17a9b59-215d-4573-9315-467c3770c731", "metadata": {}, "outputs": [], "source": [ "JSON(results)" ] }, { "cell_type": "markdown", "id": "e0b8aa51-448f-449e-8c24-c60120660484", "metadata": {}, "source": [ "## Some testing" ] }, { "cell_type": "code", "execution_count": 85, "id": "94929bd8-5f3f-49a8-a19f-ca56a2d4d5a5", "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Test the processing code across the harvested data set\n", "with Path(\"peau-data.xml\").open(\"r\") as xml_file:\n", " for i, xml in enumerate(xml_file):\n", " # if i < 100000:\n", " soup = BeautifulSoup(xml, \"xml\")\n", " eac = soup.find(\"eac-cpf\")\n", " try:\n", " process_eac(eac)\n", " except AttributeError:\n", " 
  {
   "cell_type": "markdown",
   "id": "e0b8aa51-448f-449e-8c24-c60120660484",
   "metadata": {},
   "source": [
    "## Some testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "94929bd8-5f3f-49a8-a19f-ca56a2d4d5a5",
   "metadata": {
    "tags": [
     "nbval-skip"
    ]
   },
   "outputs": [],
   "source": [
    "# Test the processing code across the harvested data set,\n",
    "# parsing each line of the file as a separate EAC-CPF record\n",
    "with Path(\"peau-data.xml\").open(\"r\") as xml_file:\n",
    "    for xml in xml_file:\n",
    "        soup = BeautifulSoup(xml, \"xml\")\n",
    "        eac = soup.find(\"eac-cpf\")\n",
    "        try:\n",
    "            process_eac(eac)\n",
    "        except AttributeError:\n",
    "            # Show the record that caused the problem, then re-raise\n",
    "            print(soup.prettify())\n",
    "            raise\n",
    "        soup.decompose()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f01a0ce9-79fe-4890-a9bf-b3dab7796fcd",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](http://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/).\n",
    "\n",
    "The development of this notebook was supported by the [Australian Cultural Data Engine](https://www.acd-engine.org/)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}