{ "cells": [ { "cell_type": "markdown", "id": "269f718f-3a08-4958-8a66-12918e51a3ec", "metadata": {}, "source": [ "# Harvest SRU API results as JSON\n", "\n", "You can query the People & Organisations data using the SRU (Search/Retrieve via URL) API. The easiest way to understand how to build SRU queries is to play around with the [online interface](http://www.nla.gov.au/apps/srw/search/peopleaustralia). More [information on the SRU protocol](https://www.loc.gov/standards/sru/) is available from the Library of Congress.\n", "\n", "Trove's people and organisation records are available in a number of XML formats, the richest and most complex of which is [EAC-CPF](https://eac.staatsbibliothek-berlin.de/). However, the XML records are not easy to work with, so to simplify further processing, this notebook queries the SRU interface and then converts the XML results into JSON." ] }, { "cell_type": "code", "execution_count": 1, "id": "c60c39de-30e9-4144-8af0-a8d1e68ac385", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "import requests_cache\n", "from bs4 import BeautifulSoup\n", "from IPython.display import JSON\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from tqdm.auto import tqdm\n", "\n", "s = requests_cache.CachedSession()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n", "s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n", "s.mount(\"http://\", HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "code", "execution_count": 2, "id": "36a093b9-0709-4855-bdb6-79ec679f32c2", "metadata": {}, "outputs": [], "source": [ "# Available SRU parameters\n", "\n", "params = {\n", " # 'query': 'rec.identifier=\"http://nla.gov.au/nla.party-641680\"', # Can specify a particular property, it not searches all (?) 
fields\n", " \"query\": \"\",\n", " \"version\": \"1.1\",\n", " \"operation\": \"searchRetrieve\",\n", " \"recordSchema\": \"urn:isbn:1-931666-33-4\", # This specifies records in EAC-CPF format\n", " \"maximumRecords\": 100,\n", " \"startRecord\": 1,\n", " \"resultSetTTL\": 300,\n", " \"recordPacking\": \"xml\",\n", " \"recordXPath\": \"\",\n", " \"sortKeys\": \"\",\n", "}\n", "\n", "# SRU endpoint\n", "api_url = \"http://www.nla.gov.au/apps/srw/search/peopleaustralia\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "48c959ef-681d-4ac8-8b22-da03d25da5ff", "metadata": {}, "outputs": [], "source": [ "def get_total_results(params):\n", " params[\"maximumRecords\"] = 0\n", " response = s.get(api_url, params=params)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " return int(soup.find(\"numberOfRecords\").string)" ] }, { "cell_type": "code", "execution_count": 4, "id": "9c006708-34f0-48f9-af50-3cfe59075ebe", "metadata": {}, "outputs": [], "source": [ "def soup_string(elem, prop):\n", " \"\"\"\n", " Saves on memory by not keeping BS navigable string\n", " \"\"\"\n", " if value := elem.find(prop):\n", " string = str(value.string).strip()\n", " if string == \"None\":\n", " string = value.get_text()\n", " return string\n", "\n", "\n", "def get_attr(elem, prop, attr):\n", " if value := elem.find(prop):\n", " return value.attrs.get(attr)\n", "\n", "\n", "def get_date(elem, prop):\n", " try:\n", " date = elem.find(prop)[\"standardDateTime\"]\n", " except (KeyError):\n", " try:\n", " date = elem.find(prop)[\"standardDate\"]\n", " except KeyError:\n", " date = soup_string(elem, prop)\n", " except TypeError:\n", " date = None\n", " return date\n", "\n", "\n", "def get_dates(history):\n", " dates = {}\n", " if history:\n", " for event in history.find_all(\"maintenanceEvent\"):\n", " event_type = soup_string(event, \"eventType\")\n", " event_date = get_date(event, \"eventDateTime\")\n", " if event_type == \"created\":\n", " dates[\"date_created\"] = event_date\n", " elif event_type == \"updated\":\n", " dates[\"date_modified\"] = event_date\n", " return dates\n", "\n", "\n", "def get_names(identity):\n", " names = []\n", " for name_entry in identity.find_all(\"nameEntry\"):\n", " name = {}\n", " for part in name_entry.find_all(\"part\"):\n", " if part.has_attr(\"localType\"):\n", " name_type = part[\"localType\"]\n", " else:\n", " name_type = \"name\"\n", " try:\n", " name[name_type].append(str(part.string))\n", " except (KeyError, AttributeError):\n", " name[name_type] = [str(part.string)]\n", " if name_entry.find(\"authorizedForm\"):\n", " name[\"authorized\"] = True\n", " else:\n", " name[\"authorized\"] = False\n", " names.append(name)\n", " return names\n", "\n", "\n", "def get_exist_dates(description):\n", " exist_dates = {}\n", " dates = description.find(\"existDates\")\n", " if dates:\n", " exist_dates[\"date_from\"] = get_date(dates, \"fromDate\")\n", " exist_dates[\"date_to\"] = get_date(dates, \"toDate\")\n", " return exist_dates\n", "\n", "\n", "def get_places(description):\n", " places = []\n", " places_elem = description.find(\"places\")\n", " if places_elem:\n", " for place_entry in places_elem.find_all(\"place\"):\n", " place = {\n", " \"place_type\": soup_string(place_entry, \"placeRole\"),\n", " \"name\": soup_string(place_entry, \"placeEntry\"),\n", " \"date_from\": get_date(place_entry, \"fromDate\"),\n", " \"date_to\": get_date(place_entry, \"toDate\"),\n", " }\n", " places.append(place)\n", " return places\n", "\n", "\n", "def get_events(description):\n", " events 
= []\n", " for event_list in description.find_all(\"chronList\"):\n", " for event in event_list.find_all(\"chronItem\"):\n", " events.append(\n", " {\n", " \"name\": soup_string(event, \"event\"),\n", " \"date\": get_date(event, \"date\"),\n", " \"date_from\": get_date(event, \"fromDate\"),\n", " \"date_to\": get_date(event, \"toDate\"),\n", " }\n", " )\n", " return events\n", "\n", "\n", "def get_occupations(description):\n", " occupations = []\n", " if occupation_list := description.find(\"occupations\"):\n", " for occupation in occupation_list.find_all(\"occupation\"):\n", " occupations.append(soup_string(occupation, \"term\"))\n", " return occupations\n", "\n", "\n", "def get_related_entities(eac):\n", " related = []\n", " for relation in eac.find_all(\"cpfRelation\"):\n", " # Can be resourceRelation or cpfRelation\n", " if description := relation.find(\"descriptiveNote\"):\n", " description = description.get_text().strip()\n", " else:\n", " description = None\n", " related.append(\n", " {\n", " \"relation_type\": relation.attrs.get(\"cpfRelationType\"),\n", " \"href\": relation.attrs.get(\"href\"),\n", " \"name\": soup_string(relation, \"relationEntry\"),\n", " \"entity_type\": get_attr(relation, \"relationEntry\", \"localType\"),\n", " \"date_from\": get_date(relation, \"fromDate\"),\n", " \"date_to\": get_date(relation, \"toDate\"),\n", " \"description\": description,\n", " }\n", " )\n", " return related\n", "\n", "\n", "def get_related_resources(eac):\n", " related = []\n", " for relation in eac.find_all(\"resourceRelation\"):\n", " # Can be resourceRelation or cpfRelation\n", " relation_type = relation.attrs.get(\"resourceRelationType\")\n", " if relation.find(\"dc\"):\n", " if description := relation.find_all(\"description\"):\n", " description = \" \".join([d.get_text() for d in description])\n", " related.append(\n", " {\n", " \"relation_type\": relation_type,\n", " \"href\": soup_string(relation, \"identifier\"),\n", " \"name\": soup_string(relation, \"title\"),\n", " \"resource_type\": None,\n", " \"contributor\": soup_string(relation, \"contributor\"),\n", " \"date\": soup_string(relation, \"date\"),\n", " \"description\": description,\n", " }\n", " )\n", " else:\n", " if description := relation.find(\"abstract\"):\n", " description = description.get_text()\n", " related.append(\n", " {\n", " \"relation_type\": relation_type,\n", " \"href\": relation.attrs.get(\"href\"),\n", " \"name\": soup_string(relation, \"relationEntry\"),\n", " \"resource_type\": get_attr(relation, \"relationEntry\", \"localType\"),\n", " \"contributor\": soup_string(relation, \"name\"),\n", " \"date\": soup_string(relation, \"date\"),\n", " \"description\": description,\n", " }\n", " )\n", " return related\n", "\n", "\n", "def get_biog(description):\n", " biog = []\n", " for bio in description.find_all(\"biogHist\"):\n", " for para in bio.find_all(\"p\"):\n", " biog.append(str(para.string).strip())\n", " return \" \".join(biog)\n", "\n", "\n", "def get_sources(eac):\n", " sources = []\n", " for source_eac in eac.find_all(\"eac-cpf\"):\n", " source = process_eac(source_eac)\n", " source[\"related_entities\"] = get_related_entities(source_eac)\n", " source[\"related_resources\"] = get_related_resources(source_eac)\n", " sources.append(source)\n", " return sources\n", "\n", "\n", "def get_agency_details(agency_element):\n", " agency = {\n", " \"agency_id\": soup_string(agency_element, \"agencyCode\"),\n", " \"agency_name\": soup_string(agency_element, \"agencyName\"),\n", " }\n", " return agency\n", 
"\n", "\n", "def get_eac_meta(eac):\n", " meta = {\"record_id\": soup_string(eac, \"recordId\")}\n", " control = eac.find(\"control\")\n", " # agency\n", " meta.update(get_agency_details(control.find(\"maintenanceAgency\")))\n", " meta.update(get_dates(control.find(\"maintenanceHistory\")))\n", " return meta\n", "\n", "\n", "def format_name(names, entity_type):\n", " authorized = None\n", " combined_names = []\n", " for name in names:\n", " if name[\"authorized\"] is True:\n", " authorized = name\n", " break\n", " if not authorized:\n", " try:\n", " authorized = names[0]\n", " except IndexError:\n", " pass\n", " if authorized:\n", " for name_type in [\"forename\", \"surname\", \"name\", \"parent\"]:\n", " combined_names += authorized.get(name_type, [])\n", " return \" \".join(combined_names)\n", "\n", "\n", "def process_eac(eac):\n", " record = get_eac_meta(eac)\n", " identity = eac.find(\"identity\")\n", " record[\"names\"] = get_names(identity)\n", " record[\"entity_type\"] = soup_string(identity, \"entityType\")\n", " record[\"entity_id\"] = soup_string(identity, \"entityId\")\n", " record[\"name\"] = format_name(record[\"names\"], record[\"entity_type\"])\n", " description = eac.find(\"description\")\n", " if not description:\n", " description = eac.find(\"cpfDescription\")\n", " record[\"dates\"] = get_exist_dates(description)\n", " record[\"places\"] = get_places(description)\n", " record[\"occupations\"] = get_occupations(description)\n", " record[\"abstract\"] = soup_string(description, \"abstract\")\n", " record[\"description\"] = get_biog(description)\n", " record[\"events\"] = get_events(description)\n", " record[\"sources\"] = get_sources(eac)\n", " return record\n", "\n", "\n", "def get_records(params):\n", " records = []\n", " response = s.get(api_url, params=params)\n", " soup = BeautifulSoup(response.content, \"xml\")\n", " for result in soup.find_all(\"record\"):\n", " eac = result.find(\"eac-cpf\")\n", " # get id info here\n", " record = process_eac(eac)\n", " record[\"trove_url\"] = f\"https://nla.gov.au/nla.party-{record['record_id']}\"\n", " records.append(record)\n", " return records\n", "\n", "\n", "def harvest_results(params):\n", " records = []\n", " total = get_total_results(params.copy())\n", " start = 1\n", " with tqdm(total=total) as pbar:\n", " while start <= total:\n", " params[\"start\"] = start\n", " new_records = get_records(params)\n", " records += new_records\n", " start += 100\n", " pbar.update(len(new_records))\n", " return records" ] }, { "cell_type": "code", "execution_count": null, "id": "28db810d-3718-42ab-be57-b554298ef2c7", "metadata": {}, "outputs": [], "source": [ "search_params = params.copy()\n", "search_params[\"query\"] = \"wragge\"\n", "results = harvest_results(search_params)" ] }, { "cell_type": "code", "execution_count": null, "id": "e17a9b59-215d-4573-9315-467c3770c731", "metadata": {}, "outputs": [], "source": [ "JSON(results)" ] }, { "cell_type": "markdown", "id": "e0b8aa51-448f-449e-8c24-c60120660484", "metadata": {}, "source": [ "## Some testing" ] }, { "cell_type": "code", "execution_count": 85, "id": "94929bd8-5f3f-49a8-a19f-ca56a2d4d5a5", "metadata": { "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "# Test the processing code across the harvested data set\n", "with Path(\"peau-data.xml\").open(\"r\") as xml_file:\n", " for i, xml in enumerate(xml_file):\n", " # if i < 100000:\n", " soup = BeautifulSoup(xml, \"xml\")\n", " eac = soup.find(\"eac-cpf\")\n", " try:\n", " process_eac(eac)\n", " except AttributeError:\n", " 
  {
   "cell_type": "markdown",
   "id": "e0b8aa51-448f-449e-8c24-c60120660484",
   "metadata": {},
   "source": [
    "## Some testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "94929bd8-5f3f-49a8-a19f-ca56a2d4d5a5",
   "metadata": {
    "tags": [
     "nbval-skip"
    ]
   },
   "outputs": [],
   "source": [
    "# Test the processing code across the harvested data set,\n",
    "# parsing each line of the file as a separate EAC-CPF record\n",
    "with Path(\"peau-data.xml\").open(\"r\") as xml_file:\n",
    "    for xml in xml_file:\n",
    "        soup = BeautifulSoup(xml, \"xml\")\n",
    "        eac = soup.find(\"eac-cpf\")\n",
    "        try:\n",
    "            process_eac(eac)\n",
    "        except AttributeError:\n",
    "            # Show the record that caused the problem, then re-raise\n",
    "            print(soup.prettify())\n",
    "            raise\n",
    "        soup.decompose()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f01a0ce9-79fe-4890-a9bf-b3dab7796fcd",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](http://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/).\n",
    "\n",
    "The development of this notebook was supported by the [Australian Cultural Data Engine](https://www.acd-engine.org/)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}