# Sydney Stock Exchange – details by volume

In [1]:
import re
from pathlib import Path

import arrow
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display

In [2]:
# Base share link for the Cloudstor folder; per-volume download links are
# built by appending to it.
cloudstor_url = "https://cloudstor.aarnet.edu.au/plus/s/i02k4gxeEpMAUkm"

# EAD (XML) export of the series description from the ANU archives catalogue.
ead_url = "https://archivescollection.anu.edu.au/index.php/or59j;ead?sf_format=xml"

In [3]:
# Load the three reference tables used when describing each volume.
# The date and page lists both have a "date" column that is parsed on load.
df_dates = pd.read_csv("complete_date_list.csv", parse_dates=["date"])
df_pages = pd.read_csv("complete_page_list.csv", parse_dates=["date"])

# One row per volume in the series
df_vols = pd.read_csv("series_list.csv")

# The volume number is the run of digits after the last hyphen of Item_number;
# pull it out as text first, then store it as an integer column.
vol_num_text = df_vols["Item_number"].str.extract(r"-(\d+)$", expand=False)
df_vols["vol_num"] = vol_num_text.astype("int")

In [4]:
# Get the list of volumes from ATOM in XML
# A timeout stops the notebook hanging forever on a stalled connection.
response = requests.get(ead_url, timeout=60)
# Fail here on an HTTP error instead of parsing an error page, which would
# just make the volume loop find nothing.
response.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup chooses one from whatever
# is installed — consider pinning it once the intended parser is confirmed.
soup = BeautifulSoup(response.text)

In [5]:
# For every <c> component in the finding aid: work out the volume number,
# build links to the archive description, repository item, PDF and Cloudstor
# images, save the volume's page list as a CSV, and display a summary.

# These patterns do not change between volumes, so compile them once.
vol_num_pattern = re.compile(r"-(\d+)$")
dspace_pattern = re.compile(r"(http.*)$")

for vol in soup.find_all("c"):
    # The volume number is the trailing "-<digits>" of the unit id
    vol_num = int(vol_num_pattern.search(vol.find("unitid").text).group(1))
    # NOTE(review): the slug suffix is vol_num + 1 — presumably the archive's
    # slugs are offset by one from the volume number; confirm.
    atom_url = f"https://archivescollection.anu.edu.au/index.php/sydney-stock-exchange-stock-and-share-lists-{vol_num + 1}"

    # Get links to metadata and PDF in DSpace
    try:
        dspace_url = (
            dspace_pattern.search(vol.find("altformavail").get_text())
            .group(1)
            .strip()
        )
    except AttributeError:
        # Either there is no <altformavail> element or it contains no URL
        dspace_url = None
        pdf_url = None
    else:
        pdf_url = f'{dspace_url.replace("handle", "bitstream")}/2/AU%20NBAC%20N193-{vol_num:03}.pdf'

    # Create link to download folder of images from Cloudstor
    cloudstor_folder_url = (
        f"{cloudstor_url}/download?path=AU%20NBAC%20N193-{vol_num:03}"
    )

    # Get the list of pages from this volume and save as a CSV
    pages = df_pages.loc[df_pages["vol_num"] == vol_num].copy(deep=False)
    if not pages.empty:
        # Add a link to the page image on Cloudstor; everything except the
        # page number is the same for every row of this volume.
        image_prefix = f"{cloudstor_folder_url}&files=N193-{vol_num:03}_"
        pages["image_url"] = [
            f"{image_prefix}{int(page_num):04}.jpg" for page_num in pages["page_num"]
        ]
        csv_file = Path(f"volumes/vol-{vol_num}-pages.csv")
        # to_csv does not create missing directories, so make sure the
        # output folder exists before writing.
        csv_file.parent.mkdir(parents=True, exist_ok=True)
        pages.to_csv(csv_file, index=False)
    else:
        csv_file = None

    # Get the date range of this volume (first matching row of the series list)
    vol_details = df_vols.loc[df_vols["vol_num"] == vol_num]
    start_date = vol_details.iloc[0]["start_date"]
    end_date = vol_details.iloc[0]["end_date"]

    # Get list of dates covered by this volume (both ends inclusive)
    dates = df_dates.loc[
        (df_dates["date"] >= start_date) & (df_dates["date"] <= end_date)
    ]

    # Get total page numbers
    num_pages = dates["pages"].sum()
    num_expected = dates["expected"].sum()

    # Avoid dividing by zero when no pages are expected for the date range
    if num_expected:
        completeness = f"{num_pages / num_expected:.2%}"
    else:
        completeness = "n/a"

    # Display the volume details
    date_format = "D MMMM YYYY"
    start_label = arrow.get(start_date).format(date_format)
    end_label = arrow.get(end_date).format(date_format)
    display(HTML(f"<h3>Volume {vol_num}: {start_label} – {end_label}</h3>"))
    display(
        HTML(
            f"<p>Estimated {num_pages} of {num_expected} pages ({completeness} complete)</p>"
        )
    )

    # Build the list of links; optional ones are added only when available
    link_items = [f'<li><a href="{atom_url}">View description in archive</a></li>']
    if dspace_url:
        link_items.append(
            f'<li><a href="{dspace_url}">View item in repository</a></li>'
        )
        link_items.append(f'<li><a href="{pdf_url}">View PDF</a></li>')
    link_items.append(
        f'<li><a href="{cloudstor_folder_url}">Download images from Cloudstor</a></li>'
    )
    if csv_file:
        # Use forward slashes so the relative link is a valid URL on any OS
        csv_href = csv_file.as_posix()
        link_items.append(
            f'<li><a href="{csv_href}" download="{csv_href}">Download page list as CSV</a></li>'
        )
    display(HTML("<ul>" + "".join(link_items) + "</ul>"))

---
Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io/).