{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import pandas as pd\n",
    "import zipfile\n",
    "from zipfile import BadZipFile\n",
    "import io\n",
    "import os\n",
    "from tqdm.auto import tqdm\n",
    "import time\n",
    "import pandas as pd\n",
    "from fpdf import FPDF, HTMLMixin\n",
    "from pathlib import Path\n",
    "from PIL import Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the digitised-books CSV into a DataFrame\n",
    "df = pd.read_csv(filepath_or_buffer='trove_digitised_books.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the ids of every row recorded as having exactly one page\n",
    "trove_ids = df.loc[df['pages'].eq(1), 'trove_id'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1834"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of single-page ids collected above (an id that appears in several rows is counted each time)\n",
    "len(trove_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/97/3k8bv0h154546cb53jdx3nvc0000gn/T/ipykernel_45558/1415393295.py:3: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0\n",
      "Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`\n",
      "  for trove_id in tqdm_notebook(trove_ids):\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9857dc43ae9843a087f1a6f9643f9e46",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1834 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "output_dir = 'images'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "# Ids whose download failed or did not return a valid zip, kept for inspection or a retry\n",
    "failed_ids = []\n",
    "# `tqdm` is the name imported from tqdm.auto in the imports cell; `tqdm_notebook` is never imported\n",
    "for trove_id in tqdm(trove_ids):\n",
    "    # Skip items whose first page has already been downloaded, so the cell can be re-run safely\n",
    "    if os.path.exists('{}/{}-1.jpg'.format(output_dir, trove_id)):\n",
    "        continue\n",
    "    url = 'https://nla.gov.au/{}/download?downloadOption=zip&firstPage=0&lastPage=0'.format(trove_id)\n",
    "    try:\n",
    "        # A timeout stops one stalled request from hanging the whole harvest\n",
    "        r = requests.get(url, timeout=60)\n",
    "        r.raise_for_status()\n",
    "        # The image is in a zip, so we need to extract the contents into the output directory\n",
    "        with zipfile.ZipFile(io.BytesIO(r.content)) as z:\n",
    "            z.extractall(output_dir)\n",
    "    except (requests.RequestException, BadZipFile):\n",
    "        # Keep going, but remember the id instead of dropping it silently\n",
    "        failed_ids.append(trove_id)\n",
    "    # Pause between requests\n",
    "    time.sleep(1)\n",
    "print(f'{len(failed_ids)} downloads failed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1582"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the distinct ids; comparing with the total above shows how many are repeats\n",
    "len(set(trove_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-read the CSV with default NaN conversion switched off so empty fields are not turned into NaN;\n",
    "# the PDF cell below relies on string truthiness and str.replace for contributors/date.\n",
    "df = pd.read_csv(\n",
    "    'trove_digitised_books.csv',\n",
    "    keep_default_na=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PDF(FPDF, HTMLMixin):\n",
    "    \"\"\"FPDF document with write_html() support mixed in.\"\"\"\n",
    "\n",
    "# NOTE: this is a macOS system font location; point it at another TrueType file on other platforms\n",
    "font_path = '/System/Library/Fonts/Supplemental/Georgia.ttf'\n",
    "# Resized copies of the page images are written here before being embedded in the PDF\n",
    "tmp_dir = Path('temp')\n",
    "tmp_dir.mkdir(exist_ok=True)\n",
    "# Ids whose downloaded image could not be opened, resized or re-saved\n",
    "unreadable_ids = []\n",
    "\n",
    "pdf = PDF()\n",
    "pdf.set_image_filter(\"DCTDecode\")\n",
    "pdf.add_font('serif', fname=font_path, uni=True)\n",
    "pdf.set_font('serif', '', 12)\n",
    "pdf.compress = True\n",
    "pdf.set_left_margin(15)\n",
    "pdf.set_right_margin(15)\n",
    "pdf.set_top_margin(15)\n",
    "\n",
    "# Title page\n",
    "pdf.add_page()\n",
    "pdf.write_html(\"\"\"\n",
    "<h1>A miscellany of ephemera, oddities, and estrays</h1>\n",
    "<p>&nbsp;</p>\n",
    "<p>This collection comprises digitised items from the Trove book zone with nary but a single page. \n",
    "You will find an odd mix of posters, pamphlets, advertisements, ephemera, and other assorted documents.</p>\n",
    "\n",
    "<p>It was compiled by <a href=\"https://timsherratt.org\">Tim Sherratt</a> to help researchers and promote use of Australia's digital cultural collections. \n",
    "The methods used to harvest the metadata and images are described in the <a href=\"https://glam-workbench.github.io/trove-books/\">Trove Books</a> \n",
    "section of the <a href=\"https://glam-workbench.github.io/\">GLAM Workbench</a>.</p>\n",
    " \n",
    "\"\"\")\n",
    "\n",
    "# One PDF page per distinct single-page item, ordered by date and then by id\n",
    "single_pages = (\n",
    "    df.loc[df['pages'] == 1]\n",
    "    .drop_duplicates(subset='trove_id')\n",
    "    .sort_values(by=['date', 'trove_id'])\n",
    ")\n",
    "for row in single_pages.itertuples():\n",
    "    img_path = Path('images', f'{row.trove_id}-1.jpg')\n",
    "    # Items whose image was never downloaded are left out of the PDF\n",
    "    if not img_path.exists():\n",
    "        continue\n",
    "    contributors = row.contributors.replace('|', ',')\n",
    "    if contributors and row.date:\n",
    "        byline = f'<p>{contributors} &middot; {row.date}</p>'\n",
    "    elif contributors or row.date:\n",
    "        # Only one of the two values is present, so no separator is needed\n",
    "        byline = f'<p>{contributors}{row.date}</p>'\n",
    "    else:\n",
    "        byline = ''\n",
    "    tmp_path = tmp_dir / img_path.name\n",
    "    try:\n",
    "        # The context manager closes the source file once the resized copy has been written\n",
    "        with Image.open(img_path) as img:\n",
    "            w, h = img.size\n",
    "            img.thumbnail((800, 800), resample=Image.LANCZOS)\n",
    "            # Landscape images are rotated a quarter turn - presumably to suit the fixed-height slot on the page\n",
    "            page_img = img.transpose(Image.ROTATE_90) if w > h else img\n",
    "            page_img.save(tmp_path, quality=80)\n",
    "    except OSError:\n",
    "        # Unreadable or corrupt image files (Pillow's UnidentifiedImageError is an OSError): record and move on\n",
    "        unreadable_ids.append(row.trove_id)\n",
    "        continue\n",
    "    pdf.add_page()\n",
    "    pdf.image(tmp_path, x=15, y=15, h=180)\n",
    "    # Move the text cursor below the image before writing the caption\n",
    "    pdf.ln(190)\n",
    "\n",
    "    pdf.write_html(f'<p>{row.title}</p>{byline}<p><a href=\"{row.fulltext_url}\">{row.fulltext_url}</a></p>')\n",
    "if unreadable_ids:\n",
    "    print(f'Skipped {len(unreadable_ids)} unreadable images: {unreadable_ids}')\n",
    "pdf.output(\"ephemera.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}