{ "cells": [ { "cell_type": "markdown", "id": "f5c8e011-d82e-463a-b534-993847e1061c", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "# Enrich the list of periodicals from the Trove API\n", "\n", "In [Periodicals from /magazine/titles endpoint](periodicals-from-api.ipynb) I harvested details of digitised periodicals and issues from the Trove API, removed duplicate titles, removed Parliamentary Papers, and tried to find missing issues. I noted that there were still some problems – in particular, some title links actually went to issues, and vice versa. This notebook tries to fix those problems and enriches the harvested data by extracting additional information from the website. It creates two datasets – one for titles and one for issues – and loads these into an SQLite database for use with Datasette Lite." ] }, { "cell_type": "code", "execution_count": 35, "id": "852f9a96-3f61-4a6e-85de-dd7c5757709a", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's import the libraries we need.\n", "import json\n", "import os\n", "import re\n", "import time\n", "from datetime import timedelta\n", "from pathlib import Path\n", "\n", "import arrow\n", "import pandas as pd\n", "import requests_cache\n", "from bs4 import BeautifulSoup\n", "from dotenv import load_dotenv\n", "from humanize import naturalsize\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from slugify import slugify\n", "from sqlite_utils import Database\n", "\n", "# from slugify import slugify\n", "from tqdm.auto import tqdm\n", "\n", "s = requests_cache.CachedSession(expire_after=timedelta(days=30))\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n", "s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n", "s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n", "\n", "load_dotenv()" ] }, { "cell_type": "markdown", "id": "bcb50380-ff5d-4bb2-bd5e-c1958fb3a76b", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Check and enrich metadata" ] }, { "cell_type": "code", "execution_count": 3, "id": "22f142fb-4176-448e-ba62-74de6e858e2d", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def get_metadata(id):\n", " \"\"\"\n", " Extract work data in a JSON string from the work's HTML page.\n", " \"\"\"\n", " if not id.startswith(\"http\"):\n", " id = \"https://nla.gov.au/\" + id\n", " response = s.get(id)\n", " try:\n", " work_data = re.search(\n", " r\"var work = JSON\\.parse\\(JSON\\.stringify\\((\\{.*\\})\", response.text\n", " ).group(1)\n", " except AttributeError:\n", " work_data = \"{}\"\n", " if not response.from_cache:\n", " time.sleep(0.2)\n", " return json.loads(work_data)\n", "\n", "\n", "def get_pages(work):\n", " \"\"\"\n", " Get the number of pages from the work metadata.\n", " \"\"\"\n", " try:\n", " pages = len(work[\"children\"][\"page\"])\n", " except KeyError:\n", " pages = 0\n", " return pages\n", "\n", "\n", "def get_title_ids():\n", " return [json.loads(t)[\"id\"] for t in open(\"titles-issues-added.ndjson\")]\n", "\n", "\n", "def get_iso_date(date):\n", " if date:\n", " iso_date = arrow.get(date, \"ddd, D MMM YYYY\").format(\"YYYY-MM-DD\")\n", " else:\n", " iso_date = \"\"\n", " return 
iso_date\n", "\n", "\n", "def get_issues(parent_id):\n", " \"\"\"\n", " Get the ids of issues that are children of the current record.\n", " \"\"\"\n", " start_url = \"https://nla.gov.au/{}/browse?startIdx={}&rows=20&op=c\"\n", " # The initial startIdx value\n", " start = 0\n", " # Number of results per page\n", " n = 20\n", " parts = []\n", " # If there aren't 20 results on the page then we've reached the end, so continue harvesting until that happens.\n", " while n == 20:\n", " # Get the browse page\n", " response = s.get(start_url.format(parent_id, start))\n", " # Beautifulsoup turns the HTML into an easily navigable structure\n", " soup = BeautifulSoup(response.text, \"lxml\")\n", " # Find all the divs containing issue details and loop through them\n", " details = soup.find_all(class_=\"l-item-info\")\n", " for detail in details:\n", " title = detail.find(\"h3\")\n", " if title:\n", " issue_id = title.parent[\"href\"].strip(\"/\")\n", " else:\n", " issue_id = detail.find(\"a\")[\"href\"].strip(\"/\")\n", " # Get the issue id\n", " parts.append(issue_id)\n", " if not response.from_cache:\n", " time.sleep(0.2)\n", " # Increment the startIdx\n", " start += n\n", " # Set n to the number of results on the current page\n", " n = len(details)\n", " return parts\n", "\n", "\n", "def prepare_title(metadata, issue_count=0):\n", " \"\"\"\n", " Create a periodical title record using metadata scraped from the digital viewer.\n", " \"\"\"\n", " title = {\n", " \"id\": metadata.get(\"pid\"),\n", " \"title\": metadata.get(\"title\"),\n", " \"description\": metadata.get(\"subUnitNo\", \"\"),\n", " \"extent\": metadata.get(\"extent\", \"\"),\n", " \"publisher\": metadata.get(\"publisherName\", \"\"),\n", " \"issue_count\": issue_count,\n", " }\n", " if bib_id := metadata.get(\"bibId\"):\n", " title[\"catalogue_url\"] = \"https://nla.gov.au/nla.cat-vn\" + bib_id\n", " return title\n", "\n", "\n", "def prepare_issue(title, metadata, pages):\n", " \"\"\"\n", " Create a periodical issue record using metadata scraped from the digital viewer.\n", " \"\"\"\n", " pid = metadata.get(\"pid\")\n", " issue = {\n", " \"id\": pid,\n", " \"title_id\": title.get(\"pid\"),\n", " \"title\": title.get(\"title\"),\n", " \"description\": metadata.get(\"subUnitNo\", \"\"),\n", " \"date\": get_iso_date(metadata.get(\"issueDate\", \"\")),\n", " \"url\": \"https://nla.gov.au/\" + pid,\n", " \"pages\": pages,\n", " }\n", " return issue\n", "\n", "\n", "def enrich_periodicals_data(\n", " input_file=\"titles-issues-added.ndjson\",\n", " output_titles=\"titles-enriched.ndjson\",\n", " output_issues=\"issues-enriched.ndjson\",\n", "):\n", " \"\"\"\n", " Work through all the titles and issues harvested from the API, checking that\n", " they are what they're supposed to be. 
Where titles are issues, or vice versa,\n", " try to add them to the right list.\n", "\n", " Also add extra metadata scraped from the digital viewer to title and issue records.\n", " \"\"\"\n", " # Prepare output files\n", " titles_ndjson = Path(output_titles)\n", " titles_ndjson.unlink(missing_ok=True)\n", " issues_ndjson = Path(output_issues)\n", " issues_ndjson.unlink(missing_ok=True)\n", "\n", " # Get a list of current title ids to check against\n", " title_ids = get_title_ids()\n", "\n", " total = sum(1 for _ in open(input_file))\n", "\n", " # Loop through current titles\n", " with Path(input_file).open(\"r\") as ndjson_in:\n", " for line in tqdm(ndjson_in, total=total):\n", " title_is_parent = True\n", " title = json.loads(line)\n", " # Scrape metadata from digital viewer\n", " title_metadata = get_metadata(title[\"id\"])\n", " # If this has pages then it's actually an issue\n", " # So we'll try and get issue info\n", " if title_pages := get_pages(title_metadata):\n", " # Does it have a parent title?\n", " parent_metadata = title_metadata.get(\"parent\")\n", " # If it does have a parent title and we don't have the title already,\n", " # save the title record\n", " if parent_metadata and parent_metadata[\"pid\"] not in title_ids:\n", " new_title = prepare_title(parent_metadata)\n", " with titles_ndjson.open(\"a\") as titles_out:\n", " titles_out.write(f\"{json.dumps(new_title)}\\n\")\n", " else:\n", " print(title)\n", " # Create an issue record\n", " new_issue = prepare_issue(parent_metadata, title_metadata, title_pages)\n", " with issues_ndjson.open(\"a\") as issues_out:\n", " issues_out.write(f\"{json.dumps(new_issue)}\\n\")\n", "\n", " # If it is really a title, we'll create a new title record that combines\n", " # the original record with the scraped metadata\n", " else:\n", " updated_title = title | prepare_title(title_metadata)\n", " # Clean up a few fields\n", " updated_title[\"issue_count\"] = updated_title.get(\"new_issue_count\", 0)\n", " updated_title.pop(\"new_issue_count\", None)\n", " updated_title.pop(\"unknown_dates\", None)\n", " issues = title.get(\"issues\", [])\n", " updated_title.pop(\"issues\", None)\n", "\n", " # Loop through the issues associated with this title\n", " with issues_ndjson.open(\"a\") as issues_out:\n", " for issue in issues:\n", " issue_metadata = get_metadata(issue[\"id\"])\n", " issue_pages = get_pages(issue_metadata)\n", " # If it doesn't have pages then it's probably a title\n", " if not issue_pages:\n", " # Try scraping a list of issues from the viewer\n", " parts = get_issues(issue[\"id\"])\n", " # If it has issues, then we'll treat it like a title\n", " if parts:\n", " title_is_parent = False\n", " # If we don't already have this as a title, then add it\n", " if issue[\"id\"] not in title_ids:\n", " new_title = prepare_title(issue_metadata, len(parts))\n", " with titles_ndjson.open(\"a\") as titles_out:\n", " titles_out.write(f\"{json.dumps(new_title)}\\n\")\n", " # Add all the issues belonging to this title\n", " for part in parts:\n", " part_metadata = get_metadata(part)\n", " part_pages = get_pages(part_metadata)\n", " new_issue = prepare_issue(\n", " issue_metadata, part_metadata, part_pages\n", " )\n", " issues_out.write(f\"{json.dumps(new_issue)}\\n\")\n", " # If it is an issue, create a new record that adds in the scraped metadata and number of pages\n", " else:\n", " updated_issue = issue | prepare_issue(\n", " title_metadata, issue_metadata, issue_pages\n", " )\n", " issues_out.write(f\"{json.dumps(updated_issue)}\\n\")\n", "\n", 
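" # Note: title_is_parent is set to False above when one of this record's\n", " # 'issues' turns out to be a title in its own right (it has child issues\n", " # of its own); in that case the new title and its issues have already\n", " # been written to the output files above.\n",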
" # If it really is a title (and not an issue) write it to the dataset\n", " if title_is_parent:\n", " with titles_ndjson.open(\"a\") as titles_out:\n", " titles_out.write(f\"{json.dumps(updated_title)}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "id": "c2e0c178-ee24-45ce-94b8-d7f19ea1671f", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "enrich_periodicals_data()" ] }, { "cell_type": "markdown", "id": "28177bb2-9c64-4d92-8dca-bc69d59249ff", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Create issues dataset" ] }, { "cell_type": "code", "execution_count": 49, "id": "634b5e7c-2436-421c-b3ea-789aef643813", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def save_issues(\n", " input_file=\"issues-enriched.ndjson\", output_file=\"periodical-issues.csv\"\n", "):\n", " df = pd.read_json(input_file, lines=True)\n", " df.drop_duplicates(inplace=True)\n", "\n", " # Remove where id is duplicated and id = title_id\n", " df = df.loc[~((df.duplicated(\"id\")) & (df[\"id\"] == df[\"title_id\"]))]\n", "\n", " def add_download_link(row):\n", " last_page = row[\"pages\"] - 1\n", " return f\"https://trove.nla.gov.au/{row['id']}/download?downloadOption=ocr&firstPage=0&lastPage={last_page}\"\n", "\n", " # Add a link to download the complete issue text\n", " df[\"text_download_url\"] = df.apply(add_download_link, axis=1)\n", "\n", " # Save as CSV\n", " df.sort_values([\"title\", \"date\"]).to_csv(output_file, index=False)\n", "\n", " # Add thumbnail details in JSON for Datasette\n", " df.insert(\n", " 0,\n", " \"thumbnail\",\n", " df[\"url\"].apply(\n", " lambda x: f'{{\"img_src\": \"{x + \"-t\"}\"}}' if not pd.isnull(x) else \"\"\n", " ),\n", " )\n", " return df" ] }, { "cell_type": "code", "execution_count": 50, "id": "1fe7f863-7d84-411d-bba8-f32c89bcdd45", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_issues = save_issues()" ] }, { "cell_type": "markdown", "id": "8d1d01f7-bf33-4b1f-a9f0-ea52e62cab34", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Create titles dataset" ] }, { "cell_type": "code", "execution_count": 71, "id": "27b4e7b4-62df-42df-85c7-46c219c2dbaa", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [], "source": [ "def merge_lists(column):\n", " try:\n", " return column.apply(lambda x: \"|\".join(x) if isinstance(x, list) else x)\n", " except AttributeError:\n", " return column\n", "\n", "\n", "def add_download_link(row, json=True):\n", " if row[\"issue_count\"] > 0:\n", " title = slugify(row[\"title\"])[:50]\n", " url = f\"https://trove-journals.s3.ap-southeast-2.amazonaws.com/{title}-{row['title_id']}.zip\"\n", " response = s.head(url)\n", " size = int(response.headers[\"Content-Length\"])\n", " return (\n", " f'{{\"href\": \"{url}\", \"label\": \"Download text ({naturalsize(size)} zip)\"}}'\n", " )\n", " return \"\"\n", "\n", "\n", "def save_titles(\n", " df_issues, input_file=\"titles-enriched.ndjson\", output_file=\"periodical-titles.csv\"\n", "):\n", " df = pd.read_json(input_file, lines=True)\n", " df = df.apply(merge_lists)\n", " df = df.sort_values(\"issue_count\").drop_duplicates([\"id\"], keep=\"last\")\n", "\n", " # Add thumbnail details from issues\n", " df = pd.merge(\n", " df,\n", " 
df_issues.sort_values(\"date\")\n", " .groupby(\"title_id\")\n", " .head(1)[[\"title_id\", \"thumbnail\"]],\n", " how=\"left\",\n", " left_on=\"id\",\n", " right_on=\"title_id\",\n", " )\n", " # Add a url that will search for articles in the periodical\n", " df[\"search_url\"] = df[\"id\"].apply(\n", " lambda x: f'{{\"href\": \"https://trove.nla.gov.au/search/category/magazines?keyword=%22{x}%22\", \"label\": \"Search for articles in Trove\"}}'\n", " )\n", "\n", " # Add a link to a zip file containing the OCRd text of this title\n", " df[\"download_text\"] = df.apply(add_download_link, axis=1)\n", "\n", " # Clean up column names\n", " df.rename(\n", " columns={\n", " \"troveUrl\": \"trove_url\",\n", " \"startDate\": \"start_date\",\n", " \"endDate\": \"end_date\",\n", " },\n", " inplace=True,\n", " )\n", "\n", " # Sort columns\n", " df = df[\n", " [\n", " \"thumbnail\",\n", " \"id\",\n", " \"title\",\n", " \"description\",\n", " \"publisher\",\n", " \"trove_url\",\n", " \"search_url\",\n", " \"download_text\",\n", " \"issue_count\",\n", " \"start_date\",\n", " \"end_date\",\n", " \"start_year\",\n", " \"end_year\",\n", " \"extent\",\n", " \"place\",\n", " \"issn\",\n", " \"catalogue_url\",\n", " ]\n", " ]\n", "\n", " # Make sure numbers are integers\n", " df[\"issue_count\"] = df[\"issue_count\"].astype(\"Int64\")\n", " df[\"start_year\"] = df[\"start_year\"].astype(\"Int64\")\n", " df[\"end_year\"] = df[\"end_year\"].astype(\"Int64\")\n", "\n", " # Save data to CSV\n", " df_csv = df.copy()\n", " # Extract the url for text downloads\n", " df_csv[\"download_text\"] = df[\"download_text\"].apply(\n", " lambda x: json.loads(x)[\"href\"] if x else \"\"\n", " )\n", " # Remove thumbnail and search_url and save as CSV\n", " df_csv.drop(columns=[\"thumbnail\", \"search_url\"]).sort_values(\"title\").to_csv(\n", " output_file, index=False\n", " )\n", "\n", " return df" ] }, { "cell_type": "code", "execution_count": 72, "id": "ba14710e-bd14-48a3-a245-8968fa42cbf1", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [], "source": [ "df_titles = save_titles(df_issues)" ] }, { "cell_type": "markdown", "id": "79ecab23-19f1-4800-b32b-3d4a75b304e6", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [] }, "source": [ "## Create an SQLite database" ] }, { "cell_type": "code", "execution_count": 34, "id": "2b2e22ad-d4aa-402a-9d0a-4a4ed550285a", "metadata": { "editable": true, "slideshow": { "slide_type": "" }, "tags": [ "nbval-skip" ] }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "db = Database(\"periodicals.db\", recreate=True)\n", "\n", "db[\"titles\"].insert_all(df_titles.to_dict(orient=\"records\"), pk=\"id\")\n", "db[\"titles\"].enable_fts([\"title\", \"publisher\"])\n", "\n", "df_issues = df_issues.drop(\"title\", axis=1)\n", "db[\"issues\"].insert_all(df_issues.to_dict(orient=\"records\"), pk=\"id\")\n", "db[\"issues\"].add_foreign_key(\"title_id\", \"titles\", \"id\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "43a53c02-ed7f-4ff4-9580-c5224b0b9bec", "metadata": { "collapsed": true, "editable": true, "jupyter": { "outputs_hidden": true, "source_hidden": true }, "scrolled": true, "slideshow": { "slide_type": "" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4d255b28b1cc42c3910e7c5f18601786", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/10 
[00:00