# Harvest SRU API results as JSON

You can query the People & Organisations data using the SRU (Search/Retrieve via URL) API. The easiest way to understand how to build SRU queries is to play around with the [online interface](http://www.nla.gov.au/apps/srw/search/peopleaustralia). More [information on the SRU protocol](https://www.loc.gov/standards/sru/) is available from the Library of Congress.

Trove's people and organisation records are available in a number of XML formats, the richest and most complex of which is [EAC-CPF](https://eac.staatsbibliothek-berlin.de/). However, the XML records are not easy to work with, so to simplify further processing, this notebook queries the SRU interface and then converts the XML results into JSON.

In [1]:
from pathlib import Path

import requests_cache
from bs4 import BeautifulSoup
from IPython.display import JSON
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# All requests in this notebook go through this cached session
s = requests_cache.CachedSession()
# Retry policy applied to both URL schemes below: up to 5 attempts with
# backoff for responses with a 502, 503 or 504 status
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

In [2]:
# Available SRU parameters
# This dict is a template: copy it and set "query" before harvesting.

params = {
 # 'query': 'rec.identifier="http://nla.gov.au/nla.party-641680"', # Can specify a particular property; if not, searches all (?) fields
 "query": "",
 "version": "1.1",
 "operation": "searchRetrieve",
 "recordSchema": "urn:isbn:1-931666-33-4", # This specifies records in EAC-CPF format
 "maximumRecords": 100,  # Number of records requested per page
 "startRecord": 1,  # Position of the first record to return
 "resultSetTTL": 300,
 "recordPacking": "xml",
 "recordXPath": "",
 "sortKeys": "",
}

# SRU endpoint
api_url = "http://www.nla.gov.au/apps/srw/search/peopleaustralia"

In [3]:
def get_total_results(params):
    """
    Return the total number of records matching an SRU query.

    Parameters:
        params: dict of SRU request parameters. It is not modified.

    Returns:
        The value of the response's `numberOfRecords` element as an int.
    """
    # Only the count is needed, so ask for zero records. Build a new dict
    # instead of overwriting `maximumRecords` in the caller's parameters.
    count_params = {**params, "maximumRecords": 0}
    response = s.get(api_url, params=count_params)
    soup = BeautifulSoup(response.content, "xml")
    return int(soup.find("numberOfRecords").string)

In [4]:
def soup_string(elem, prop):
    """
    Return the text of the first `prop` element found inside `elem`.

    Saves on memory by not keeping BS navigable string: the value is
    converted to a plain `str` before it is returned.

    Parameters:
        elem: the element to search within.
        prop: name of the element whose text is wanted.

    Returns:
        The stripped string when the element has a single string value,
        the combined text from `get_text()` when it doesn't, or None when
        no matching element is found.
    """
    value = elem.find(prop)
    if not value:
        return None
    # `.string` is None when there is no single string value. Test for that
    # directly rather than comparing the converted text with "None", which
    # would also match an element whose text really is "None".
    if value.string is None:
        return value.get_text()
    return str(value.string).strip()


def get_attr(elem, prop, attr):
    """
    Return the value of attribute `attr` on the first `prop` element inside
    `elem`, or None if the element or the attribute is missing.
    """
    match = elem.find(prop)
    if not match:
        return None
    return match.attrs.get(attr)


def get_date(elem, prop):
    """
    Return the date held by the first `prop` element inside `elem`.

    The `standardDateTime` attribute is preferred, then `standardDate`,
    and finally the element's text. Returns None when there is no
    `prop` element.
    """
    node = elem.find(prop)
    if node is None:
        return None
    for attr_name in ("standardDateTime", "standardDate"):
        if node.has_attr(attr_name):
            return node[attr_name]
    # No standardised value available, so fall back to the element's text
    return soup_string(elem, prop)


def get_dates(history):
    """
    Collect the created/updated dates from a maintenance history element.

    Returns a dict that may contain `date_created` and `date_modified`.
    When several events share a type, the last one found is kept. An empty
    dict is returned if `history` is missing.
    """
    if not history:
        return {}
    # Maps the event types of interest to the keys used in the output
    key_for_event = {"created": "date_created", "updated": "date_modified"}
    found = {}
    for entry in history.find_all("maintenanceEvent"):
        kind = soup_string(entry, "eventType")
        when = get_date(entry, "eventDateTime")
        if kind in key_for_event:
            found[key_for_event[kind]] = when
    return found


def get_names(identity):
    """
    Extract every name entry from an identity element.

    Parameters:
        identity: element containing `nameEntry` children.

    Returns:
        A list with one dict per `nameEntry`. Each dict maps a part's
        `localType` (or "name" when the part has none) to the list of
        values for that type, plus an "authorized" key that is True when
        the entry contains an `authorizedForm` element.
    """
    names = []
    for name_entry in identity.find_all("nameEntry"):
        name = {}
        for part in name_entry.find_all("part"):
            name_type = part["localType"] if part.has_attr("localType") else "name"
            # `.string` is None when the part has no single string value.
            # Converting that with str() would store the text "None" as a
            # name, so use the combined text instead and skip empty parts.
            raw = part.string
            text = part.get_text() if raw is None else str(raw)
            if not text:
                continue
            name.setdefault(name_type, []).append(text)
        name["authorized"] = bool(name_entry.find("authorizedForm"))
        names.append(name)
    return names


def get_exist_dates(description):
    """
    Return the `date_from` / `date_to` values from the description's
    `existDates` element, or an empty dict when there is no such element.
    """
    span = description.find("existDates")
    if not span:
        return {}
    return {
        "date_from": get_date(span, "fromDate"),
        "date_to": get_date(span, "toDate"),
    }


def get_places(description):
    """
    List the places recorded under the description's `places` element.

    Each place is a dict with `place_type`, `name`, `date_from` and
    `date_to`. Returns an empty list when there is no `places` element.
    """
    container = description.find("places")
    if not container:
        return []
    return [
        {
            "place_type": soup_string(entry, "placeRole"),
            "name": soup_string(entry, "placeEntry"),
            "date_from": get_date(entry, "fromDate"),
            "date_to": get_date(entry, "toDate"),
        }
        for entry in container.find_all("place")
    ]


def get_events(description):
    """
    List the `chronItem` entries found in every `chronList` of the
    description, each as a dict with `name`, `date`, `date_from` and
    `date_to`.
    """
    return [
        {
            "name": soup_string(item, "event"),
            "date": get_date(item, "date"),
            "date_from": get_date(item, "fromDate"),
            "date_to": get_date(item, "toDate"),
        }
        for chron_list in description.find_all("chronList")
        for item in chron_list.find_all("chronItem")
    ]


def get_occupations(description):
    """
    Return the `term` of each `occupation` listed under the description's
    `occupations` element, or an empty list when that element is absent.
    """
    holder = description.find("occupations")
    if not holder:
        return []
    return [soup_string(item, "term") for item in holder.find_all("occupation")]


def get_related_entities(eac):
    """
    Describe each `cpfRelation` found in the record.

    Every relation becomes a dict holding its type, href, name, entity
    type, date range, and the stripped text of its `descriptiveNote`
    (None when there is no note).
    """
    entities = []
    for link in eac.find_all("cpfRelation"):
        note = link.find("descriptiveNote")
        summary = note.get_text().strip() if note else None
        entities.append(
            {
                "relation_type": link.attrs.get("cpfRelationType"),
                "href": link.attrs.get("href"),
                "name": soup_string(link, "relationEntry"),
                "entity_type": get_attr(link, "relationEntry", "localType"),
                "date_from": get_date(link, "fromDate"),
                "date_to": get_date(link, "toDate"),
                "description": summary,
            }
        )
    return entities


def get_related_resources(eac):
    """
    Describe each `resourceRelation` found in the record.

    Relations that contain a `dc` element are read from their
    `identifier`, `title`, `contributor`, `date` and `description`
    children. All other relations are read from the relation's own
    attributes and its `relationEntry`, `name`, `date` and `abstract`
    children.

    Returns:
        A list of dicts with the keys `relation_type`, `href`, `name`,
        `resource_type`, `contributor`, `date` and `description`.
        `description` is a string, or None when the relation has none.
    """
    related = []
    for relation in eac.find_all("resourceRelation"):
        relation_type = relation.attrs.get("resourceRelationType")
        if relation.find("dc"):
            descriptions = relation.find_all("description")
            # Use None rather than an empty list when nothing is found, so
            # the value is always either a string or None in both branches
            description = (
                " ".join([d.get_text() for d in descriptions])
                if descriptions
                else None
            )
            related.append(
                {
                    "relation_type": relation_type,
                    "href": soup_string(relation, "identifier"),
                    "name": soup_string(relation, "title"),
                    "resource_type": None,
                    "contributor": soup_string(relation, "contributor"),
                    "date": soup_string(relation, "date"),
                    "description": description,
                }
            )
        else:
            abstract = relation.find("abstract")
            description = abstract.get_text() if abstract else None
            related.append(
                {
                    "relation_type": relation_type,
                    "href": relation.attrs.get("href"),
                    "name": soup_string(relation, "relationEntry"),
                    "resource_type": get_attr(relation, "relationEntry", "localType"),
                    "contributor": soup_string(relation, "name"),
                    "date": soup_string(relation, "date"),
                    "description": description,
                }
            )
    return related


def get_biog(description):
    """
    Join the paragraphs of every `biogHist` element into a single string.

    Parameters:
        description: element containing zero or more `biogHist` children.

    Returns:
        The stripped text of each non-empty `p`, separated by single
        spaces. An empty string when there are no paragraphs.
    """
    paragraphs = []
    for bio in description.find_all("biogHist"):
        for para in bio.find_all("p"):
            # `.string` is None when a paragraph has no single string value.
            # Converting that with str() would put the text "None" into
            # the biography, so use the combined text instead.
            raw = para.string
            text = (para.get_text() if raw is None else str(raw)).strip()
            if text:
                paragraphs.append(text)
    return " ".join(paragraphs)


def get_sources(eac):
    """
    Process each `eac-cpf` element found inside `eac` as a record of its
    own, adding its related entities and related resources.
    """

    def build_source(source_eac):
        # Same processing as a top-level record, plus the relations
        source = process_eac(source_eac)
        source["related_entities"] = get_related_entities(source_eac)
        source["related_resources"] = get_related_resources(source_eac)
        return source

    return [build_source(nested) for nested in eac.find_all("eac-cpf")]


def get_agency_details(agency_element):
    """
    Return the agency's code and name as `agency_id` and `agency_name`.
    """
    # (output key, element name) pairs, in output order
    fields = (("agency_id", "agencyCode"), ("agency_name", "agencyName"))
    return {key: soup_string(agency_element, tag) for key, tag in fields}


def get_eac_meta(eac):
    """
    Gather a record's metadata: its record id, the maintenance agency
    details, and the created/modified dates from the maintenance history.
    """
    record_id = soup_string(eac, "recordId")
    control = eac.find("control")
    agency = get_agency_details(control.find("maintenanceAgency"))
    dates = get_dates(control.find("maintenanceHistory"))
    return {"record_id": record_id, **agency, **dates}


def format_name(names, entity_type):
    """
    Build a single display name from a list of name dicts.

    The first name flagged as authorized is used; failing that, the first
    name in the list. Its forename, surname, name and parent values are
    joined, in that order, with spaces. Returns an empty string when
    `names` is empty. `entity_type` is accepted but not used.
    """
    chosen = next((entry for entry in names if entry["authorized"] is True), None)
    if not chosen and names:
        chosen = names[0]
    if not chosen:
        return ""
    pieces = [
        value
        for key in ("forename", "surname", "name", "parent")
        for value in chosen.get(key, [])
    ]
    return " ".join(pieces)


def process_eac(eac):
    """
    Convert one `eac-cpf` element into a dict.

    The result starts with the metadata from `get_eac_meta()` and then
    adds names, entity type and id, a formatted name, existence dates,
    places, occupations, abstract, biography, events and nested sources.
    """
    record = get_eac_meta(eac)
    identity = eac.find("identity")
    names = get_names(identity)
    entity_type = soup_string(identity, "entityType")
    entity_id = soup_string(identity, "entityId")
    # Descriptive details are read from `description` if present,
    # otherwise from `cpfDescription`
    description = eac.find("description") or eac.find("cpfDescription")
    record.update(
        {
            "names": names,
            "entity_type": entity_type,
            "entity_id": entity_id,
            "name": format_name(names, entity_type),
            "dates": get_exist_dates(description),
            "places": get_places(description),
            "occupations": get_occupations(description),
            "abstract": soup_string(description, "abstract"),
            "description": get_biog(description),
            "events": get_events(description),
            "sources": get_sources(eac),
        }
    )
    return record


def get_records(params):
    """
    Request one page of SRU results and convert each `record` element
    into a dict, adding a `trove_url` built from the record id.
    """

    def to_record(result):
        # The EAC-CPF payload sits inside each SRU record wrapper
        record = process_eac(result.find("eac-cpf"))
        record["trove_url"] = f"https://nla.gov.au/nla.party-{record['record_id']}"
        return record

    response = s.get(api_url, params=params)
    soup = BeautifulSoup(response.content, "xml")
    return [to_record(result) for result in soup.find_all("record")]


def harvest_results(params):
    """
    Harvest every record matching an SRU query, one page at a time.

    Parameters:
        params: dict of SRU request parameters. It is copied, so the
            caller's dict is left untouched.

    Returns:
        A list of record dicts as produced by `get_records()`.
    """
    # Page through a private copy rather than modifying the caller's dict
    query = dict(params)
    total = get_total_results(query.copy())
    records = []
    start = 1
    with tqdm(total=total) as pbar:
        while start <= total:
            # The paging parameter is `startRecord`. Setting a `start` key
            # leaves `startRecord` unchanged, so every request would ask
            # for the same first page.
            query["startRecord"] = start
            new_records = get_records(query)
            if not new_records:
                # Stop rather than loop forever if a page comes back empty
                break
            records += new_records
            # Advance by the number of records actually received, so the
            # paging stays correct whatever page size is in use
            start += len(new_records)
            pbar.update(len(new_records))
    return records

In [None]:
# Start from the default SRU parameters and fill in the search term
search_params = {**params, "query": "wragge"}
results = harvest_results(search_params)

In [None]:
# Display the harvested records using IPython's JSON display object
JSON(results)

## Some testing

In [85]:
# Test the processing code across the harvested data set.
# Each line of the file is parsed as a separate XML document.
with Path("peau-data.xml").open("r") as xml_file:
    for xml in xml_file:
        soup = BeautifulSoup(xml, "xml")
        eac = soup.find("eac-cpf")
        try:
            process_eac(eac)
        except AttributeError:
            # Show the record that couldn't be processed before re-raising
            print(soup.prettify())
            raise
        soup.decompose()

----

Created by [Tim Sherratt](http://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/).

The development of this notebook was supported by the [Australian Cultural Data Engine](https://www.acd-engine.org/).