In [1]:
import html
import io
import os
import time
import zipfile
from pathlib import Path
from zipfile import BadZipFile

import pandas as pd
import requests
from fpdf import FPDF, HTMLMixin
from PIL import Image
from tqdm.auto import tqdm

In [2]:
# Load the digitised books metadata CSV into a DataFrame, using pandas'
# default handling of blank fields (they are read as NaN).
df = pd.read_csv('trove_digitised_books.csv')

In [3]:
# Collect the trove_id of every row whose page count is exactly one.
is_single_page = df['pages'] == 1
trove_ids = df.loc[is_single_page, 'trove_id'].tolist()

In [4]:
# Number of single-page rows selected; repeated trove_ids are each counted.
len(trove_ids)

1834

In [5]:
# Download the first page image of every single-page item into `output_dir`.
# The cell is resumable: ids whose image is already on disk are skipped.
output_dir = 'images'
os.makedirs(output_dir, exist_ok=True)

# Ids whose download failed or did not return a usable zip, kept so they can
# be inspected or retried instead of disappearing silently.
failed_ids = []

# `tqdm` is the name imported from tqdm.auto at the top of the notebook;
# `tqdm_notebook` is never imported, so using it fails on a fresh kernel.
for trove_id in tqdm(trove_ids):
    # Check to see if the first page of this item has already been downloaded
    # (assumes the zip unpacks to '<trove_id>-1.jpg', as the PDF cell also expects)
    if os.path.exists('{}/{}-1.jpg'.format(output_dir, trove_id)):
        continue
    url = 'https://nla.gov.au/{}/download?downloadOption=zip&firstPage=0&lastPage=0'.format(trove_id)
    try:
        # A timeout stops one stalled connection from hanging the whole harvest
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        # The image is in a zip, so we need to extract the contents into the output directory
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall(output_dir)
    except (requests.RequestException, BadZipFile):
        # Best effort: remember the failure and carry on with the next item
        failed_ids.append(trove_id)
    # Pause between requests to avoid hammering the server
    time.sleep(1)

if failed_ids:
    print('{} downloads failed; see `failed_ids`'.format(len(failed_ids)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for trove_id in tqdm_notebook(trove_ids):


  0%|          | 0/1834 [00:00<?, ?it/s]

In [7]:
# Count the distinct trove_ids (a set already has a length, no list needed).
len(set(trove_ids))

1582

In [37]:
# Re-read the CSV with keep_default_na=False so blank fields load as empty
# strings rather than NaN; the truthiness checks on contributors/date used
# when building the PDF bylines depend on this.
# NOTE(review): this also stops blank cells in numeric columns becoming NaN --
# if 'pages' has any blanks the column may load as strings and the
# `df['pages'] == 1` filter would then match nothing; confirm against the data.
df = pd.read_csv('trove_digitised_books.csv', keep_default_na=False)

In [111]:
class PDF(FPDF, HTMLMixin):
    """FPDF document class combined with HTMLMixin; adds no behaviour of its own."""

# Settings for the document: the TrueType file registered as 'serif'
# (a macOS system font location) and the margin applied to left, right and top.
serif_font_file = '/System/Library/Fonts/Supplemental/Georgia.ttf'
page_margin = 15

# Introductory text rendered on the first page of the PDF.
intro_html = """
<h1>A miscellany of ephemera, oddities, and estrays</h1>
<p>&nbsp;</p>
<p>This collection comprises digitised items from the Trove book zone with nary but a single page. 
You will find an odd mix of posters, pamphlets, advertisements, ephemera, and other assorted documents.</p>

<p>It was compiled by <a href="https://timsherratt.org">Tim Sherratt</a> to help researchers and promote use of Australia's digital cultural collections. 
The methods used to harvest the metadata and images are described in the <a href="https://glam-workbench.github.io/trove-books/">Trove Books</a> 
section of the <a href="https://glam-workbench.github.io/">GLAM Workbench</a>.</p>
 
"""

# Create the document and configure image embedding, font and compression.
pdf = PDF()
pdf.set_image_filter("DCTDecode")
pdf.add_font('serif', fname=serif_font_file, uni=True)
pdf.set_font('serif', '', 12)
pdf.compress = True

# Same margin on the left, right and top edges (set in the original order).
for set_margin in (pdf.set_left_margin, pdf.set_right_margin, pdf.set_top_margin):
    set_margin(page_margin)

# Title page
pdf.add_page()
pdf.write_html(intro_html)

# Add one page per single-page item: a downsized copy of its image followed by
# the title, an optional byline and a link back to the item. Finally write the
# PDF to disk.

# Resized copies are written here before being embedded; the directory has to
# exist or img.save() fails on a fresh run.
tmp_dir = Path('temp')
tmp_dir.mkdir(exist_ok=True)

# One row per trove_id, ordered by date and then by id
single_pages = (
    df.loc[df['pages'] == 1]
    .drop_duplicates(subset='trove_id')
    .sort_values(by=['date', 'trove_id'])
)

for row in single_pages.itertuples():
    img_path = Path('images', f'{row.trove_id}-1.jpg')
    # Items whose image was never downloaded are left out of the PDF
    if not img_path.exists():
        continue
    tmp_path = tmp_dir / f'{row.trove_id}-1.jpg'

    try:
        # The context manager closes the source file once the copy is saved
        with Image.open(img_path) as img:
            # Orientation is judged on the original dimensions
            w, h = img.size
            img.thumbnail((800, 800), resample=Image.LANCZOS)
            # Landscape images are turned on their side to fit the portrait page
            page_img = img.transpose(Image.ROTATE_90) if w > h else img
            page_img.save(tmp_path, quality=80)
    except OSError as err:
        # Unreadable or corrupt images are skipped, but visibly rather than silently
        print(f'Skipping {img_path}: {err}')
        continue

    # Metadata is plain text, so escape it before placing it in HTML markup
    contributors = html.escape(row.contributors.replace("|", ","))
    date = html.escape(str(row.date))
    title = html.escape(str(row.title))
    fulltext_url = html.escape(str(row.fulltext_url))

    if row.contributors and row.date:
        byline = f'<p>{contributors} &middot; {date}</p>'
    elif row.contributors or row.date:
        # Only one of the two is present, so no separator is needed
        byline = f'<p>{contributors}{date}</p>'
    else:
        byline = ''

    pdf.add_page()
    pdf.image(tmp_path, x=15, y=15, h=180)
    # Move the text cursor below the image before writing the caption
    pdf.ln(190)
    pdf.write_html(f'<p>{title}</p>{byline}<p><a href="{fulltext_url}">{fulltext_url}</a></p>')

pdf.output("ephemera.pdf")