{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get the page coordinates of a digitised newspaper article from Trove" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from IPython.display import display\n", "from IPython.display import Image as DisplayImage\n", "from bs4 import BeautifulSoup\n", "from PIL import Image, ImageDraw\n", "from io import BytesIO\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_box(zones):\n", " '''\n", " Loop through all the zones to find the outer limits of each boundary.\n", " Return a bounding box around the article.\n", " '''\n", " left = 10000\n", " right = 0\n", " top = 10000\n", " bottom = 0\n", " page_id = zones[0]['data-page-id']\n", " for zone in zones:\n", " if int(zone['data-y']) < top:\n", " top = int(zone['data-y'])\n", " if int(zone['data-x']) < left:\n", " left = int(zone['data-x'])\n", " if (int(zone['data-x']) + int(zone['data-w'])) > right:\n", " right = int(zone['data-x']) + int(zone['data-w'])\n", " if (int(zone['data-y']) + int(zone['data-h'])) > bottom:\n", " bottom = int(zone['data-y']) + int(zone['data-h'])\n", " return {'page_id': page_id, 'left': left, 'top': top, 'right': right, 'bottom': bottom}\n", " \n", "def get_article_boxes(article_url):\n", " '''\n", " Positional information about the article is attached to each block of the OCR output in data attributes.\n", " This function loads the HTML version of the article and scrapes the x, y, and width values for each block of text \n", " to determine the coordinates of a box around the article.\n", " '''\n", " boxes = []\n", " # Get the article page\n", " response = requests.get(article_url)\n", " # Load in BS4\n", " soup = BeautifulSoup(response.text, 'lxml')\n", " # Lines of OCR are in divs with the class 'zone'\n", " # 'onPage' limits to those on the current page\n", " zones = soup.select('div.zone.onPage')\n", " boxes.append(get_box(zones))\n", " off_page_zones = soup.select('div.zone.offPage')\n", " if off_page_zones:\n", " current_page = off_page_zones[0]['data-page-id']\n", " zones = []\n", " for zone in off_page_zones:\n", " if zone['data-page-id'] == current_page:\n", " zones.append(zone)\n", " else:\n", " boxes.append(get_box(zones))\n", " zones = [zone]\n", " current_page = zone['data-page-id']\n", " boxes.append(get_box(zones)) \n", " return boxes\n", "\n", "def display_boxes(boxes):\n", " for box in boxes:\n", " # Construct the url we need to download the page image\n", " page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(box['page_id'], 7)\n", " # Download the page image\n", " response = requests.get(page_url)\n", " # Open download as an image for editing\n", " img = Image.open(BytesIO(response.content))\n", " img = img.convert(mode='RGB')\n", " draw = ImageDraw.Draw(img)\n", " # Draw a rectangle on the image\n", " draw.rectangle([(box['left'], box['top']), (box['right'], box['bottom'])], outline=(0,255,0), width=20)\n", " buffer = BytesIO()\n", " img.save(buffer, format='JPEG')\n", " display(DisplayImage(data=buffer.getvalue(), width=400))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "boxes = get_article_boxes('https://trove.nla.gov.au/newspaper/article/258166628?searchTerm=wragge')\n", "print(boxes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "display_boxes(boxes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## What can I do with this?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the GLAM Workbench there's a notebook (and app) to save an article as an image using the code above. But what about building something like this into a pipeline to assemble a dataset of images? Perhaps illustrated advertisements by decade, or by product type, of from the *Australian Women's Weekly*? A collection of weather maps?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/). \n", "Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }