{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download the contents of a digitised file from the National Archives of Australia\n",
    "\n",
    "**Why?** RecordSearch lets you download a PDF of a digitised file, but sometimes it's more convenient to work with individual images.\n",
    "\n",
    "**How?** Just enter the barcode of the file in the box below and click the button. When all the images have been downloaded they'll be zipped up, and a convenient download link will be displayed.\n",
    "\n",
    "**More?** Click the 'Edit App' button at the top of the page to see how this works."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import time\n",
    "\n",
    "import ipywidgets as widgets\n",
    "import requests\n",
    "from IPython.display import HTML, display\n",
    "from recordsearch_data_scraper.scrapers import RSItem\n",
    "from slugify import slugify\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Template for the url of a page image -- filled in with the item identifier and page number\n",
    "IMAGE_URL = (\n",
    "    \"https://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P\"\n",
    ")\n",
    "\n",
    "\n",
    "def _download_image(img_url, filename, timeout=60):\n",
    "    \"\"\"\n",
    "    Save the image at img_url to filename.\n",
    "\n",
    "    The image is streamed to a temporary '.part' file that is only renamed to\n",
    "    filename once the download has finished, so an interrupted download can't\n",
    "    leave behind a truncated image that later runs would skip as already saved.\n",
    "\n",
    "    Parameters:\n",
    "        img_url: url of the image to download\n",
    "        filename: path to save the image to\n",
    "        timeout: seconds to wait for the server before giving up\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.RequestException: if the request fails, times out,\n",
    "            or returns an error status\n",
    "    \"\"\"\n",
    "    temp_filename = f\"{filename}.part\"\n",
    "    try:\n",
    "        # NOTE(review): verify=False switches off TLS certificate checks. It's kept\n",
    "        # from the existing code, presumably as a workaround -- TODO confirm it's still needed.\n",
    "        with requests.get(\n",
    "            img_url, stream=True, verify=False, timeout=timeout\n",
    "        ) as response:\n",
    "            response.raise_for_status()\n",
    "            with open(temp_filename, \"wb\") as out_file:\n",
    "                # Unlike response.raw, iter_content() decodes gzip/deflate\n",
    "                # content encodings, so the bytes saved are the image itself.\n",
    "                for chunk in response.iter_content(chunk_size=65536):\n",
    "                    out_file.write(chunk)\n",
    "        os.replace(temp_filename, filename)\n",
    "    except BaseException:\n",
    "        # Don't leave partial downloads in the folder that gets zipped\n",
    "        if os.path.exists(temp_filename):\n",
    "            os.remove(temp_filename)\n",
    "        raise\n",
    "\n",
    "\n",
    "def get_images(b):\n",
    "    \"\"\"\n",
    "    Download all the page images of a digitised file and zip them up.\n",
    "\n",
    "    This is the click handler for the 'Get images' button. It reads the barcode\n",
    "    from the `barcode` text widget and sends the progress bar, messages, and\n",
    "    download link to the `out` output widget.\n",
    "\n",
    "    Pages already saved by a previous run are not downloaded again.\n",
    "\n",
    "    Parameters:\n",
    "        b: the button that was clicked (not used)\n",
    "    \"\"\"\n",
    "    with out:\n",
    "        # Ignore stray whitespace, eg from a pasted barcode\n",
    "        item_id = barcode.value.strip()\n",
    "        if not item_id:\n",
    "            print(\"You need to provide a barcode!\")\n",
    "            return\n",
    "        item = RSItem(item_id).data\n",
    "        page_count = item[\"digitised_pages\"]\n",
    "        if not page_count > 0:\n",
    "            print(\"Sorry, that item has not been digitised...\")\n",
    "            return\n",
    "        identifier = item[\"identifier\"]\n",
    "        series = slugify(item[\"series\"])\n",
    "        control = slugify(item[\"control_symbol\"])\n",
    "        dir_path = os.path.join(\"data\", \"images\", f\"{series}-{control}-[{identifier}]\")\n",
    "        os.makedirs(dir_path, exist_ok=True)\n",
    "        for page in tqdm(range(1, page_count + 1)):\n",
    "            filename = os.path.join(dir_path, f\"{identifier}-p{page}.jpg\")\n",
    "            if os.path.exists(filename):\n",
    "                continue\n",
    "            _download_image(IMAGE_URL.format(identifier, page), filename)\n",
    "            # Brief pause after each request (presumably to avoid hammering the server)\n",
    "            time.sleep(0.5)\n",
    "        shutil.make_archive(dir_path, \"zip\", dir_path)\n",
    "        link = f\"{dir_path}.zip\"\n",
    "        # Relative link -- assumes the notebook server serves files from the\n",
    "        # notebook's working directory (TODO confirm for your deployment)\n",
    "        display(\n",
    "            HTML(\n",
    "                f'Download zipped images: <a href=\"{link}\" download>{link}</a>'\n",
    "            )\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "barcode = widgets.Text(\n",
    "    placeholder=\"Enter item barcode\", description=\"Barcode:\", disabled=False\n",
    ")\n",
    "display(barcode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "button = widgets.Button(\n",
    "    description=\"Get images\",\n",
    "    disabled=False,\n",
    "    button_style=\"primary\",  # 'success', 'info', 'warning', 'danger' or ''\n",
    "    tooltip=\"Click to harvest images\",\n",
    "    icon=\"\",\n",
    ")\n",
    "button.on_click(get_images)\n",
    "display(button)\n",
    "\n",
    "out = widgets.Output()\n",
    "display(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "# Load environment variables if available\n",
    "%load_ext dotenv\n",
    "%dotenv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TESTING\n",
    "if os.getenv(\"GW_STATUS\") == \"dev\":\n",
    "    barcode.value = \"149309\"\n",
    "    button.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](https://timsherratt.org/) as part of the [GLAM Workbench](https://glam-workbench.github.io/)."
] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": { "02a0f72e2c7c4ea19823ac7dde9ccdcb": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "0411a2330cb64363b3681f16f43f8597": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "05366ae7030b4caaa31fb5c3810632ef": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "1481a91670dc4410afa67b8a1bb78f9b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "bar_style": "success", "layout": "IPY_MODEL_95f3be4142a8437d9e27e7af8379fe49", "max": 4, "style": "IPY_MODEL_3c5dc96e763f46e1b04c6da15c175d19", "value": 4 } }, "149131dfdcfe4988b4bb4419681ad836": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonModel", "state": { "button_style": "primary", "description": "Get images", "layout": "IPY_MODEL_21fda09c8f8e43f28bd56049d06bbc8a", "style": "IPY_MODEL_8c13b9065b9745fcb3a891187e20b71b", "tooltip": "Click to harvest images" } }, "21fda09c8f8e43f28bd56049d06bbc8a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "248bbe4e8f854003af9bdea8c4f91462": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "2c1fc82b1d514da89c0a0bec87b99045": { "model_module": 
"@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "children": [ "IPY_MODEL_9a70e14525484a37a2075760fba15e5b", "IPY_MODEL_46af03bf4c064208a153aa92e981a0e4", "IPY_MODEL_a44abe26ea88469abd99369fd7833b9a" ], "layout": "IPY_MODEL_d4e890b865ab403597cc0e2fe7cfee95" } }, "3139cf03ac094ad59c31c2b7c86d0576": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "32f2f85369034a8a9c6dbd4d228febbd": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "description_width": "" } }, "36ad7981996444c18da7b03b5eb5b5a0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "3c1f9d319e034a7d9c4a7b0c8d717e84": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "children": [ "IPY_MODEL_9c12ac63779948a789d2ed03906ff220", "IPY_MODEL_e17bbf087e9e426691d6fd7a04bdf4ce", "IPY_MODEL_d3669af1db404db695a73ff8d1390970" ], "layout": "IPY_MODEL_f30a0ac01b2c45eeaddfd21fc5358e26" } }, "3c5dc96e763f46e1b04c6da15c175d19": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "description_width": "" } }, "40a221c113844fd19d2d9da426596634": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonModel", "state": { "button_style": "primary", "description": "Get images", "layout": "IPY_MODEL_76269166c07c448a9965046b604820fa", "style": "IPY_MODEL_449514f5f4b7414baa4d357e367a412b", "tooltip": "Click to harvest images" } }, "433a198607474420904eb0b9c65838b5": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "4426a48f05df4b3196c94550c7eab221": { "model_module": 
"@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "TextModel", "state": { "description": "Barcode:", "layout": "IPY_MODEL_433a198607474420904eb0b9c65838b5", "placeholder": "Enter item barcode", "style": "IPY_MODEL_c27e610c98ee414e8c6c06c614a26359", "value": "149309" } }, "449514f5f4b7414baa4d357e367a412b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonStyleModel", "state": {} }, "46af03bf4c064208a153aa92e981a0e4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "bar_style": "success", "layout": "IPY_MODEL_248bbe4e8f854003af9bdea8c4f91462", "max": 4, "style": "IPY_MODEL_990dfd30905f4c7095471a3f926ea82c", "value": 4 } }, "650b813552e44edfbdcc3a48e02d16a6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "TextModel", "state": { "description": "Barcode:", "layout": "IPY_MODEL_bf6f82db2ba04a27a3ee1cf8f80db6e1", "placeholder": "Enter item barcode", "style": "IPY_MODEL_e1f96d9e0ed64604ac906d08e8dca497", "value": "149309" } }, "6ec1eea60e4046128078509ab1a526aa": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "74042098381f4436a23566d311016718": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "7520364341d343cdb2b626e81cb0b823": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "children": [ "IPY_MODEL_838113bfa73542aba421eda99deee98e", "IPY_MODEL_1481a91670dc4410afa67b8a1bb78f9b", "IPY_MODEL_f749f620f4ad4a46878222812a326e45" ], "layout": "IPY_MODEL_7cc37780f3294e21816ee9387a7b22eb" } }, "76269166c07c448a9965046b604820fa": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", 
"state": {} }, "7cc37780f3294e21816ee9387a7b22eb": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "838113bfa73542aba421eda99deee98e": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_8dab271b273a4f8a943d22922d2eef97", "style": "IPY_MODEL_a2c335a239464cf88445822b4dc8eeff", "value": "100%" } }, "87f3186a4227465d924dd3cdbbe450ef": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "8a1f61eedbd24b7b8e4a8d61a44f4a55": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "8c13b9065b9745fcb3a891187e20b71b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonStyleModel", "state": {} }, "8dab271b273a4f8a943d22922d2eef97": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "8e14efaa09a04a4eb1fc20f5625f7c80": { "model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": { "layout": "IPY_MODEL_9833ecd5f660493e980c78caf7a6c761", "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": " 0%| | 0/4 [00:00data/images/a2479-17-1306-[149309].zip", "text/plain": "" }, "metadata": {}, "output_type": "display_data" } ] } }, "95f3be4142a8437d9e27e7af8379fe49": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "9833ecd5f660493e980c78caf7a6c761": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "990dfd30905f4c7095471a3f926ea82c": { "model_module": "@jupyter-widgets/controls", "model_module_version": 
"1.5.0", "model_name": "ProgressStyleModel", "state": { "description_width": "" } }, "9a70e14525484a37a2075760fba15e5b": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_8a1f61eedbd24b7b8e4a8d61a44f4a55", "style": "IPY_MODEL_d8daafec94df4accaa22cd31dba2205f", "value": "100%" } }, "9c12ac63779948a789d2ed03906ff220": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_6ec1eea60e4046128078509ab1a526aa", "style": "IPY_MODEL_cfe517d43225494dbd8e583268226cb0", "value": "100%" } }, "a2c335a239464cf88445822b4dc8eeff": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "a44abe26ea88469abd99369fd7833b9a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_02a0f72e2c7c4ea19823ac7dde9ccdcb", "style": "IPY_MODEL_36ad7981996444c18da7b03b5eb5b5a0", "value": " 4/4 [00:02<00:00, 1.98it/s]" } }, "a496ce7d46054f44a345114ec6d0de4d": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "a787eee98b6e4ef2a332b9d2183a7a48": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "a90e4c9716754d55883e2b779c705998": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "b479264dc60d475e8072813bcadf7211": { "model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": { "layout": "IPY_MODEL_dc569c76da5f4c01b00762eaafe47577", "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, 
"version_minor": 0 }, "text/plain": " 0%| | 0/4 [00:00data/images/a2479-17-1306-[149309].zip", "text/plain": "" }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": " 0%| | 0/4 [00:00data/images/a2479-17-1306-[149309].zip", "text/plain": "" }, "metadata": {}, "output_type": "display_data" } ] } }, "b6cba3d84e44472e864207eb73073bad": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "bf6f82db2ba04a27a3ee1cf8f80db6e1": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "c27e610c98ee414e8c6c06c614a26359": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "c4591ef9048041fcaaff0a050756062e": { "model_module": "@jupyter-widgets/output", "model_module_version": "1.0.0", "model_name": "OutputModel", "state": { "layout": "IPY_MODEL_0411a2330cb64363b3681f16f43f8597", "outputs": [ { "ename": "TypeError", "evalue": "'RSItem' object is not subscriptable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Input \u001b[0;32mIn [2]\u001b[0m, in \u001b[0;36mget_images\u001b[0;34m(b)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m barcode\u001b[38;5;241m.\u001b[39mvalue:\n\u001b[1;32m 4\u001b[0m item \u001b[38;5;241m=\u001b[39m RSItem(barcode\u001b[38;5;241m.\u001b[39mvalue)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mitem\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdigitised_pages\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m 
\u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 6\u001b[0m series \u001b[38;5;241m=\u001b[39m slugify(item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseries\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 7\u001b[0m control \u001b[38;5;241m=\u001b[39m slugify(item[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcontrol_symbol\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", "\u001b[0;31mTypeError\u001b[0m: 'RSItem' object is not subscriptable" ] } ] } }, "c6a2ee8c6c58495e86d8c60e54ba94be": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonStyleModel", "state": {} }, "cfe517d43225494dbd8e583268226cb0": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "d3669af1db404db695a73ff8d1390970": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_a496ce7d46054f44a345114ec6d0de4d", "style": "IPY_MODEL_74042098381f4436a23566d311016718", "value": " 4/4 [00:03<00:00, 1.00it/s]" } }, "d4e890b865ab403597cc0e2fe7cfee95": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "d7384894b83248f3ad1aff778c6567d6": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "TextModel", "state": { "description": "Barcode:", "layout": "IPY_MODEL_3139cf03ac094ad59c31c2b7c86d0576", "placeholder": "Enter item barcode", "style": "IPY_MODEL_05366ae7030b4caaa31fb5c3810632ef", "value": "149309" } }, "d8daafec94df4accaa22cd31dba2205f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "dc569c76da5f4c01b00762eaafe47577": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": 
{} }, "df7fe83d91cb4dc29aefd501a1c5b151": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "e1745f593eba4a0e89b94527b24ba669": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "e17bbf087e9e426691d6fd7a04bdf4ce": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "bar_style": "success", "layout": "IPY_MODEL_87f3186a4227465d924dd3cdbbe450ef", "max": 4, "style": "IPY_MODEL_32f2f85369034a8a9c6dbd4d228febbd", "value": 4 } }, "e1f96d9e0ed64604ac906d08e8dca497": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "description_width": "" } }, "e905c85fa8a04c03b1d2798b97bbb69d": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ButtonModel", "state": { "button_style": "primary", "description": "Get images", "layout": "IPY_MODEL_b6cba3d84e44472e864207eb73073bad", "style": "IPY_MODEL_c6a2ee8c6c58495e86d8c60e54ba94be", "tooltip": "Click to harvest images" } }, "f30a0ac01b2c45eeaddfd21fc5358e26": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": {} }, "f749f620f4ad4a46878222812a326e45": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "layout": "IPY_MODEL_e1745f593eba4a0e89b94527b24ba669", "style": "IPY_MODEL_a90e4c9716754d55883e2b779c705998", "value": " 4/4 [00:02<00:00, 1.98it/s]" } }, "fd794cf0c224411c8cc3f3a532ad3e4f": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "TextModel", "state": { "description": "Barcode:", "layout": "IPY_MODEL_a787eee98b6e4ef2a332b9d2183a7a48", "placeholder": "Enter item barcode", "style": 
"IPY_MODEL_df7fe83d91cb4dc29aefd501a1c5b151", "value": "149309" } } }, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }