{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "# Create large composite images from snipped words\n",
    "\n",
    "This is a variation of the ['scissors & paste' notebook](trove-newspapers-scissors-and-paste.ipynb) that extracts words from Trove newspaper images and compiles them into messages. In this notebook, you can harvest multiple versions of a list of words and compile them all into one big image.\n",
    "\n",
    "![Slice of composite image](images/trove_words-cropped.jpg)\n",
    "\n",
    "[View high-res version](https://www.dropbox.com/s/6bmzr9lfvptcaqt/trove_words.jpg?dl=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Import what we need\n",
    "import os\n",
    "import random\n",
    "import shutil\n",
    "import time\n",
    "from datetime import datetime\n",
    "from io import BytesIO\n",
    "from pathlib import Path\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from dotenv import load_dotenv\n",
    "from IPython.display import FileLink, display\n",
    "from PIL import Image, ImageOps\n",
    "from rectpack import SORT_NONE, newPacker\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Insert your Trove API key\n",
    "API_KEY = \"YOUR API KEY\"\n",
    "\n",
    "# Use api key value from environment variables if it is available\n",
    "if os.getenv(\"TROVE_API_KEY\"):\n",
    "    API_KEY = os.getenv(\"TROVE_API_KEY\")\n",
    "\n",
    "HEADERS = {\"X-API-KEY\": API_KEY}\n",
    "\n",
    "# List of words you want to harvest\n",
    "WORD_LIST = [\n",
    "    \"newspaper\",\n",
    "    \"book\",\n",
    "    \"magazine\",\n",
    "    \"journal\",\n",
    "    \"picture\",\n",
    "    \"data\",\n",
    "    \"music\",\n",
    "    \"map\",\n",
    "    \"people\",\n",
    "    \"discover\",\n",
    "    \"explore\",\n",
    "    \"web\",\n",
    "    \"research\",\n",
    "    \"create\",\n",
    "    \"article\",\n",
    "    \"history\",\n",
    "]\n",
    "\n",
    "# Max number of images of each word you want to harvest (sometimes the words can't be found in the article, so the actual number will probably be a little less)\n",
    "NUM_WORDS = 50\n",
    "\n",
    "# Where to save the images\n",
    "IMG_DIR = \"words\"\n",
    "\n",
    "# Create the output directory\n",
    "Path(IMG_DIR).mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_word_boxes(article_url):\n",
    "    \"\"\"\n",
    "    Get the boxes around highlighted search terms.\n",
    "    \"\"\"\n",
    "    boxes = []\n",
    "    # Get the article page\n",
    "    response = requests.get(article_url)\n",
    "    # Load in BS4\n",
    "    soup = BeautifulSoup(response.text, \"lxml\")\n",
    "    # Get the id of the newspaper page\n",
    "    page_id = soup.select(\"div.zone.onPage\")[0][\"data-page-id\"]\n",
    "    # Find the highlighted terms\n",
    "    words = soup.select(\"span.highlightedTerm\")\n",
    "    # Save the box coords\n",
    "    for word in words:\n",
    "        box = {\n",
    "            \"page_id\": page_id,\n",
    "            \"left\": int(word[\"data-x\"]),\n",
    "            \"top\": int(word[\"data-y\"]),\n",
    "            \"width\": int(word[\"data-w\"]),\n",
    "            \"height\": int(word[\"data-h\"]),\n",
    "        }\n",
    "        boxes.append(box)\n",
    "    return boxes\n",
    "\n",
    "\n",
    "def crop_word(box, kw, article_id):\n",
    "    \"\"\"\n",
    "    Crop the box coordinates from the full page image.\n",
    "    \"\"\"\n",
    "    word_path = Path(f\"{IMG_DIR}/{kw}-{article_id}.jpg\")\n",
    "    if not word_path.exists():\n",
    "        # Construct the url we need to download the page image\n",
    "        page_url = (\n",
    "            \"https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}\".format(\n",
    "                box[\"page_id\"], 7\n",
    "            )\n",
    "        )\n",
    "        # print(page_url)\n",
    "        # Download the page image\n",
    "        response = requests.get(page_url)\n",
    "        # Open download as an image for editing\n",
    "        img = Image.open(BytesIO(response.content))\n",
    "        word = img.crop(\n",
    "            (\n",
    "                box[\"left\"] - 5,\n",
    "                box[\"top\"] - 5,\n",
    "                box[\"left\"] + box[\"width\"] + 5,\n",
    "                box[\"top\"] + box[\"height\"] + 5,\n",
    "            )\n",
    "        )\n",
    "        img.close()\n",
    "        word.save(word_path)\n",
    "\n",
    "\n",
    "def get_article_from_search(kw):\n",
    "    \"\"\"\n",
    "    Use the Trove API to find articles with the supplied keyword.\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        \"q\": f'text:\"{kw}\"',\n",
    "        \"category\": \"newspaper\",\n",
    "        \"l-artType\": \"newspaper\",\n",
    "        \"encoding\": \"json\",\n",
    "        \"n\": NUM_WORDS,\n",
    "    }\n",
    "    response = requests.get(\n",
    "        \"https://api.trove.nla.gov.au/v3/result\", params=params, headers=HEADERS\n",
    "    )\n",
    "    data = response.json()\n",
    "    articles = data[\"category\"][0][\"records\"][\"article\"]\n",
    "    for article in articles:\n",
    "        boxes = []\n",
    "        try:\n",
    "            boxes = get_word_boxes(article[\"troveUrl\"])\n",
    "        except KeyError:\n",
    "            pass\n",
    "        if boxes:\n",
    "            crop_word(boxes[0], kw, article[\"id\"])\n",
    "        time.sleep(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## Get all the words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "nbval-skip"
    ]
   },
   "outputs": [],
   "source": [
    "for word in WORD_LIST:\n",
    "    get_article_from_search(word)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "## Create the composite image\n",
    "\n",
    "Here we use a packing algorithm to try and fit the little word images (which are a variety of shapes and sizes) into one big box with as few gaps as possible. Adjust the `WIDTH` and `HEIGHT` values below to change the size of the composite."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Set width of composite image\n",
    "WIDTH = 1000\n",
    "\n",
    "# Set height of composite image\n",
    "HEIGHT = 1000\n",
    "\n",
    "# Set background colour of composite image\n",
    "BG_COLOUR = (0, 0, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_image_data():\n",
    "    images = []\n",
    "    for im in [i for i in Path(IMG_DIR).glob(\"*.jpg\")]:\n",
    "        img = Image.open(im)\n",
    "        h, w = img.size\n",
    "        images.append((h + 4, w + 4, im.name))\n",
    "    return images\n",
    "\n",
    "\n",
    "def pack_images():\n",
    "    images = get_image_data()\n",
    "    random.shuffle(images)\n",
    "    packer = newPacker(sort_algo=SORT_NONE, rotation=False)\n",
    "    for i in images:\n",
    "        packer.add_rect(*i)\n",
    "    packer.add_bin(WIDTH, HEIGHT)\n",
    "    packer.pack()\n",
    "    return len(images), packer.rect_list()\n",
    "\n",
    "\n",
    "def create_composite(output_file=None):\n",
    "    num_images, rectangles = pack_images()\n",
    "    comp = Image.new(\"RGB\", (WIDTH, HEIGHT), BG_COLOUR)\n",
    "    for rect in rectangles:\n",
    "        b, x, y, w, h, rid = rect\n",
    "        # print(x,y, w, h, rid)\n",
    "        word_path = Path(IMG_DIR, rid)\n",
    "        word = Image.open(word_path)\n",
    "        word = word.convert(\"RGB\")\n",
    "        word_with_border = ImageOps.expand(word, border=2, fill=BG_COLOUR)\n",
    "        comp.paste(word_with_border, (x, y, x + w, y + h))\n",
    "    if not output_file:\n",
    "        output_file = (\n",
    "            f\"trove-words-{int(datetime.now().timestamp())}-{WIDTH}-{HEIGHT}.jpg\"\n",
    "        )\n",
    "    comp.save(output_file)\n",
    "    print(f\"{len(rectangles)} of {num_images} images used\")\n",
    "    display(FileLink(output_file))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "Run the cell below to create a composite image of the words you've harvested. The function will tell you how many of the harvested words it was able to fit into the composite. You can adjust the width and height of the composite to fit in more, or fill up gaps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": [
     "nbval-skip"
    ]
   },
   "outputs": [],
   "source": [
    "create_composite()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# FOR TESTING ONLY -- IGNORE THIS CELL\n",
    "if os.getenv(\"GW_STATUS\") == \"dev\":\n",
    "    IMG_DIR = \"test\"\n",
    "    NUM_WORDS = 2\n",
    "    Path(IMG_DIR).mkdir(parents=True, exist_ok=True)\n",
    "    WORD_LIST = [\"annoyed\", \"angry\"]\n",
    "    for word in WORD_LIST:\n",
    "        get_article_from_search(word)\n",
    "    create_composite()\n",
    "    shutil.rmtree(Path(IMG_DIR))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "editable": true,
    "slideshow": {
     "slide_type": ""
    },
    "tags": []
   },
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).  \n",
    "Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "rocrate": {
   "author": [
    {
     "mainEntityOfPage": "https://timsherratt.au",
     "name": "Sherratt, Tim",
     "orcid": "https://orcid.org/0000-0001-7956-4498"
    }
   ],
   "description": "This is a variation of the 'scissors & paste' notebook that extracts words from Trove newspaper images and compiles them into messages. In this notebook, you can harvest multiple versions of a list of words and compile them all into one big image.",
   "mainEntityOfPage": "https://glam-workbench.net/trove-newspapers/trove-newspapers-create-composite-from-words/",
   "name": "Create large composite images from snipped words"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}