# Harvesting series

In [8]:
import time
import csv
import os
import math
import string
import requests
import pandas as pd
from PIL import Image, ImageOps
from requests import ConnectionError
from recordsearch_tools.utilities import retry
from recordsearch_tools.client import RSSearchClient, RSSeriesClient
from tinydb import TinyDB, Query
try:
 from io import BytesIO
except ImportError:
 from StringIO import StringIO
from IPython.display import Image as DImage
from IPython.core.display import HTML

# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go

# This lets Plotly draw charts in cells
py.init_notebook_mode()

In [2]:
# What series do you want to harvest?
# Insert the series id between the quotes, eg. 'B13' or 'SP42/1'.
# NOTE(review): the harvesting cells further down pass series ids directly
# rather than reading this variable -- confirm whether it is still needed.
series = 'B13'

## The harvesting code

In [13]:
class SeriesHarvester():
    '''
    Harvest item metadata and digitised page images for one RecordSearch series.

    Harvested data is stored in a TinyDB file at 'data/db-<series>.json'
    (any '/' in the series id is replaced with '-'), using two tables:
    'items' for search results and 'images' for per-page image metadata.

    Parameters:
        series: the series id, eg. 'B13' or 'SP42/1'.
        control: optional control symbol filter (eg. 'A*') added to every search.
        images_only: if True, skip the initial search. The harvester can then
            only be used for harvest_images(), which reads items that are
            already in the database.
    '''

    def __init__(self, series, control=None, images_only=False):
        self.series = series
        self.control = control
        if not images_only:
            self.total_pages = None
            self.pages_complete = 0
            self.client = RSSearchClient()
            self.prepare_harvest()
        # TinyDB's default storage does not create missing parent directories,
        # so make sure 'data' exists before opening the database.
        if not os.path.exists('data'):
            os.makedirs('data')
        self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))
        self.items = self.db.table('items')
        self.images = self.db.table('images')

    def get_total(self):
        '''Return the total number of results reported by the search client.'''
        return self.client.total_results

    def prepare_harvest(self):
        '''
        Run an initial search and work out how many result pages there are.

        Sets self.total_pages, and prints the item and page counts.
        '''
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        per_page = self.client.results_per_page
        # Ceiling division: a total that is an exact multiple of the page size
        # must not produce an extra (empty) page, and zero results means zero pages.
        self.total_pages = (int(total_results) + per_page - 1) // per_page
        print(self.total_pages)

    # NOTE(review): presumably retries the call on ConnectionError (20 tries,
    # 10 seconds apart) -- confirm against recordsearch_tools.utilities.retry.
    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, page=None):
        '''
        Harvest every remaining page of search results into the 'items' table.

        Parameters:
            page: optional 1-based page number to start from. Earlier pages are
                then treated as complete. If omitted, harvesting continues
                from the page after the last one completed.
        '''
        Record = Query()
        if page:
            # Keep the page counter in step with an explicit start page so the
            # loop stops at the last page instead of running past it.
            self.pages_complete = page - 1
        page = self.pages_complete + 1
        while self.pages_complete < self.total_pages:
            params = {'series': self.series, 'page': page, 'sort': '9'}
            if self.control:
                params['control'] = self.control
            response = self.client.search(**params)
            for result in response['results']:
                # Upsert keyed on identifier so re-harvesting a page doesn't duplicate items
                self.items.upsert(result, Record.identifier == result['identifier'])
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            # Pause between requests
            time.sleep(1)

    # NOTE(review): presumably retries the call on ConnectionError (20 tries,
    # 10 seconds apart) -- confirm against recordsearch_tools.utilities.retry.
    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def harvest_images(self):
        '''
        Download the page images of every digitised item in the 'items' table.

        Images are saved as
        'data/images/<series>/<control symbol>-[<identifier>]/<identifier>-p<page>.jpg'.
        Pages whose file already exists are skipped. The dimensions of each
        newly saved image are recorded in the 'images' table.
        '''
        Record = Query()
        # TinyDB builds its query from the == comparison, so this must not be
        # rewritten as a plain truth test.
        items = self.items.search(Record.digitised_status == True)
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))
            if not os.path.exists(directory):
                os.makedirs(directory)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if not os.path.exists(filename):
                    img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)
                    response = requests.get(img_url, headers=headers, stream=True, verify=False)
                    response.raise_for_status()
                    try:
                        image = Image.open(BytesIO(response.content))
                    except IOError:
                        # The response body could not be opened as an image; skip this page
                        print('Not an image')
                    else:
                        width, height = image.size
                        image.save(filename)
                        del response
                        image_meta = {
                            'image_id': '{}-{}'.format(item['identifier'], page),
                            'identifier': item['identifier'],
                            'page': page,
                            'width': width,
                            'height': height
                        }
                        self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])
                        print('Image saved')
                    # Pause after each download attempt
                    time.sleep(1)

In [4]:
def harvest_series(series):
    '''
    Harvest the item metadata for a single series.
    Creates a SeriesHarvester for the series and runs the harvest from the first page.
    '''
    SeriesHarvester(series=series).start_harvest()

In [5]:
def harvest_large_series(series, control_range=None):
    '''
    Harvest a series that is too big to retrieve with a single search.

    RecordSearch will not return more than 20,000 results, so a series with
    more items than that has to be harvested in chunks. This function runs a
    separate harvest for each control symbol prefix in control_range.

    If no control_range is supplied, the prefixes are every uppercase letter
    followed by every digit, each with a trailing wildcard ('A*' ... 'Z*', '0*' ... '9*').
    That is enough for most large series, but in some cases you might need to
    supply a custom list of control symbol prefixes.
    '''
    prefixes = control_range
    if not prefixes:
        prefixes = ['{}*'.format(char) for char in string.ascii_uppercase + string.digits]
    for prefix in prefixes:
        print(prefix)
        SeriesHarvester(series=series, control=prefix).start_harvest()
 

In [16]:
def harvest_images(series, images_only=False):
    '''
    Download the images of digitised items in a series.
    images_only is passed straight through to SeriesHarvester.
    '''
    SeriesHarvester(series=series, images_only=images_only).harvest_images()

## Functions to process harvested data

In [5]:
def convert_to_df(series):
    '''
    Get the series data from TinyDB and return it as a Pandas dataframe.

    The 'contents_dates' dictionary of each item is flattened into separate
    columns, the columns are put in a preferred order, and the rows are
    sorted by identifier.

    Parameters:
        series: the series id, eg. 'B13' or 'SP42/1'. Any '/' is replaced with
            '-' to find the database file 'data/db-<series>.json'.

    Returns:
        A dataframe with one row per harvested item.
    '''

    # Load the series db, copy the items into a simple list, then release the file
    db = TinyDB('data/db-{}.json'.format(series.replace('/', '-')))
    try:
        item_list = list(db.table('items'))
    finally:
        db.close()

    # Now let's turn that list into a Pandas Dataframe
    df = pd.DataFrame(item_list)

    # The 'contents_dates' column holds dictionaries; flatten them into their own
    # columns so the values are easy to work with.
    # (.tolist() is used because Series.iteritems() was removed in pandas 2.0.)
    dates = pd.DataFrame(df['contents_dates'].tolist(), index=df.index)
    df = pd.concat([df, dates], axis=1)

    # Delete the old date field
    del df['contents_dates']

    # Rename column
    df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)

    # Put columns in preferred order
    df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]

    # sort_values() returns a new dataframe, so the result has to be kept.
    # NOTE(review): identifiers are compared as stored; if they are strings the
    # order is lexical rather than numeric -- confirm which is wanted.
    df = df.sort_values(['identifier']).reset_index(drop=True)

    return df

In [6]:
def save_as_csv(series):
    '''
    Write the harvested data for a series to 'data/<series>.csv'.
    The data is loaded from TinyDB via convert_to_df(); any '/' in the
    series id becomes '-' in the file name. The dataframe index is not written.
    '''
    csv_path = 'data/{}.csv'.format(series.replace('/', '-'))
    convert_to_df(series).to_csv(csv_path, index=False)

## Running the harvests

In [1]:
# Series relating to the White Australia Policy.
# Large series (> 20,000 items) are not listed here; they are harvested separately.
series_list = [
    'B6003',
    'BP343/15',
    'D2860',
    'D5036',
    'D596',
    'E752',
    'J2481',
    'J2482',
    'J2483',
    'J3115',
    'K1145',
    'P437',
    'P526',
    'PP4/2',
    'PP6/1',
    'SP11/26',
    'SP11/6',
    'SP115/1',
    'SP115/10',
    'SP42/1',
    'SP726/1',
    'ST84/1',
]

In [None]:
# Loop through series list, harvesting the item metadata of each series in turn.
# Each call to harvest_series() builds its own SeriesHarvester for that series.
for s in series_list:
 harvest_series(s)

In [17]:
# Download the digitised page images for B13.
# images_only=True skips the initial RecordSearch query, so this relies on the
# item records that are already stored in the series' TinyDB file.
for s in ['B13']:
 harvest_images(s, images_only=True)

5938961, p. 1
Image saved
5938961, p. 2
Image saved
5938961, p. 3
Image saved
5938961, p. 4
Image saved
5938961, p. 5
Image saved
5938961, p. 6
Image saved
5938961, p. 7
Image saved
5938961, p. 8
Image saved
5938961, p. 9
Image saved
5938961, p. 10
Image saved
5938961, p. 11
Image saved
5938961, p. 12
Image saved
5938961, p. 13
Image saved
5938961, p. 14
Image saved
5938961, p. 15
Image saved
5938961, p. 16
Image saved
5938961, p. 17
Image saved
5938961, p. 18
Image saved
5938961, p. 19
Image saved
5938961, p. 20
Image saved
5938961, p. 21
Image saved
5938961, p. 22
Image saved
5938961, p. 23
Image saved
5938961, p. 24
Image saved
5938961, p. 25
Image saved
5938961, p. 26
Image saved
5938961, p. 27
Image saved
5938961, p. 28
Image saved
5938961, p. 29
Image saved
5938961, p. 30
Image saved
5938961, p. 31
Image saved
5938961, p. 32
Image saved
5938961, p. 33
Image saved
5938961, p. 34
Image saved
5938961, p. 35
Image saved
5938961, p. 36
Image saved
5938961, p. 37
Image saved
5938961, p

Image saved
406310, p. 94
Image saved
406310, p. 95
Image saved
406310, p. 96
Image saved
406310, p. 97
Image saved
406310, p. 98
Image saved
406310, p. 99
Image saved
406310, p. 100
Image saved
406310, p. 101
Image saved
406310, p. 102
Image saved
406310, p. 103
Image saved
406310, p. 104
Image saved
406310, p. 105
Image saved
406310, p. 106
Image saved
406310, p. 107
Image saved
406310, p. 108
Image saved
406310, p. 109
Image saved
406310, p. 110
Image saved
406310, p. 111
Image saved
406310, p. 112
Image saved
406310, p. 113
Image saved
406310, p. 114
Image saved
406310, p. 115
Image saved
406310, p. 116
Image saved
406310, p. 117
Image saved
406310, p. 118
Image saved
406310, p. 119
Image saved
406310, p. 120
Image saved
406310, p. 121
Image saved
406310, p. 122
Image saved
406310, p. 123
Image saved
406391, p. 1
Image saved
406391, p. 2
Image saved
406391, p. 3
Image saved
406391, p. 4
Image saved
406391, p. 5
Image saved
406391, p. 6
Image saved
406391, p. 7
Image saved
406391, p

Image saved
777609, p. 2
Image saved
777609, p. 3
Image saved
777609, p. 4
Image saved
777609, p. 5
Image saved
777609, p. 6
Image saved
777609, p. 7
Image saved
777609, p. 8
Image saved
777609, p. 9
Image saved
777609, p. 10
Image saved
777609, p. 11
Image saved
777610, p. 1
Image saved
777610, p. 2
Image saved
777610, p. 3
Image saved
777610, p. 4
Image saved
777610, p. 5
Image saved
777610, p. 6
Image saved
777610, p. 7
Image saved
777610, p. 8
Image saved
777610, p. 9
Image saved
777610, p. 10
Image saved
777610, p. 11
Image saved
777610, p. 12
Image saved
777610, p. 13
Image saved
777610, p. 14
Image saved
777610, p. 15
Image saved
777610, p. 16
Image saved
777610, p. 17
Image saved
777610, p. 18
Image saved
777610, p. 19
Image saved
777610, p. 20
Image saved
777626, p. 1
Image saved
777626, p. 2
Image saved
777626, p. 3
Image saved
777626, p. 4
Image saved
777626, p. 5
Image saved
777626, p. 6
Image saved
777626, p. 7
Image saved
777626, p. 8
Image saved
777626, p. 9
Image saved


Image saved
778704, p. 4
Image saved
778704, p. 5
Image saved
778704, p. 6
Image saved
778704, p. 7
Image saved
778704, p. 8
Image saved
778704, p. 9
Image saved
778704, p. 10
Image saved
778704, p. 11
Image saved
778797, p. 1
Image saved
778797, p. 2
Image saved
778797, p. 3
Image saved
778797, p. 4
Image saved
778797, p. 5
Image saved
778797, p. 6
Image saved
778797, p. 7
Image saved
778797, p. 8
Image saved
778797, p. 9
Image saved
778797, p. 10
Image saved
778830, p. 1
Image saved
778830, p. 2
Image saved
778830, p. 3
Image saved
778830, p. 4
Image saved
778830, p. 5
Image saved
778830, p. 6
Image saved
778830, p. 7
Image saved
778830, p. 8
Image saved
778898, p. 1
Image saved
778898, p. 2
Image saved
778898, p. 3
Image saved
778898, p. 4
Image saved
778898, p. 5
Image saved
778898, p. 6
Image saved
778898, p. 7
Image saved
778898, p. 8
Image saved
778904, p. 1
Image saved
778904, p. 2
Image saved
778904, p. 3
Image saved
778904, p. 4
Image saved
778904, p. 5
Image saved
778904, p.

Image saved
784158, p. 21
Image saved
784158, p. 22
Image saved
784158, p. 23
Image saved
784158, p. 24
Image saved
784158, p. 25
Image saved
784158, p. 26
Image saved
784158, p. 27
Image saved
784158, p. 28
Image saved
784158, p. 29
Image saved
784158, p. 30
Image saved
784158, p. 31
Image saved
784188, p. 1
Image saved
784188, p. 2
Image saved
784188, p. 3
Image saved
784188, p. 4
Image saved
784188, p. 5
Image saved
784188, p. 6
Image saved
784188, p. 7
Image saved
784188, p. 8
Image saved
784188, p. 9
Image saved
784188, p. 10
Image saved
784188, p. 11
Image saved
784188, p. 12
Image saved
784188, p. 13
Image saved
784188, p. 14
Image saved
784188, p. 15
Image saved
784188, p. 16
Image saved
784188, p. 17
Image saved
784188, p. 18
Image saved
784188, p. 19
Image saved
784217, p. 1
Image saved
784217, p. 2
Image saved
784217, p. 3
Image saved
784217, p. 4
Image saved
784217, p. 5
Image saved
784217, p. 6
Image saved
784217, p. 7
Image saved
784217, p. 8
Image saved
784217, p. 9
Imag

Image saved
788350, p. 1
Image saved
788350, p. 2
Image saved
788350, p. 3
Image saved
788350, p. 4
Image saved
788350, p. 5
Image saved
788502, p. 1
Image saved
788502, p. 2
Image saved
788502, p. 3
Image saved
788502, p. 4
Image saved
788502, p. 5
Image saved
788502, p. 6
Image saved
788502, p. 7
Image saved
788502, p. 8
Image saved
788502, p. 9
Image saved
788502, p. 10
Image saved
788502, p. 11
Image saved
788502, p. 12
Image saved
788502, p. 13
Image saved
788502, p. 14
Image saved
788502, p. 15
Image saved
788502, p. 16
Image saved
788502, p. 17
Image saved
788625, p. 1
Image saved
788625, p. 2
Image saved
788625, p. 3
Image saved
788625, p. 4
Image saved
788625, p. 5
Image saved
788625, p. 6
Image saved
788625, p. 7
Image saved
788625, p. 8
Image saved
788625, p. 9
Image saved
788625, p. 10
Image saved
788625, p. 11
Image saved
788625, p. 12
Image saved
788625, p. 13
Image saved
788625, p. 14
Image saved
788625, p. 15
Image saved
788625, p. 16
Image saved
788625, p. 17
Image sav

Image saved
790898, p. 6
Image saved
790898, p. 7
Image saved
791352, p. 1
Image saved
791352, p. 2
Image saved
791352, p. 3
Image saved
791352, p. 4
Image saved
791352, p. 5
Image saved
791352, p. 6
Image saved
791352, p. 7
Image saved
791352, p. 8
Image saved
791352, p. 9
Image saved
791352, p. 10
Image saved
791352, p. 11
Image saved
791352, p. 12
Image saved
791352, p. 13
Image saved
791352, p. 14
Image saved
791352, p. 15
Image saved
791352, p. 16
Image saved
791352, p. 17
Image saved
791352, p. 18
Image saved
791352, p. 19
Image saved
791407, p. 1
Image saved
791407, p. 2
Image saved
791407, p. 3
Image saved
791407, p. 4
Image saved
791407, p. 5
Image saved
791407, p. 6
Image saved
791407, p. 7
Image saved
791407, p. 8
Image saved
791407, p. 9
Image saved
791407, p. 10
Image saved
791407, p. 11
Image saved
791407, p. 12
Image saved
791407, p. 13
Image saved
791407, p. 14
Image saved
791407, p. 15
Image saved
791407, p. 16
Image saved
791407, p. 17
Image saved
791407, p. 18
Image 

Image saved
797679, p. 3
Image saved
797849, p. 1
Image saved
797849, p. 2
Image saved
797849, p. 3
Image saved
797849, p. 4
Image saved
797849, p. 5
Image saved
797849, p. 6
Image saved
797849, p. 7
Image saved
797849, p. 8
Image saved
797849, p. 9
Image saved
797849, p. 10
Image saved
797849, p. 11
Image saved
797849, p. 12
Image saved
797849, p. 13
Image saved
797849, p. 14
Image saved
797849, p. 15
Image saved
797849, p. 16
Image saved
798305, p. 1
Image saved
798305, p. 2
Image saved
798305, p. 3
Image saved
798305, p. 4
Image saved
798305, p. 5
Image saved
798566, p. 1
Image saved
798566, p. 2
Image saved
798566, p. 3
Image saved
798566, p. 4
Image saved
798566, p. 5
Image saved
798566, p. 6
Image saved
798566, p. 7
Image saved
798566, p. 8
Image saved
798566, p. 9
Image saved
798566, p. 10
Image saved
798566, p. 11
Image saved
798566, p. 12
Image saved
798566, p. 13
Image saved
798566, p. 14
Image saved
798566, p. 15
Image saved
798566, p. 16
Image saved
798566, p. 17
Image save

Image saved
800969, p. 58
Image saved
800969, p. 59
Image saved
800969, p. 60
Image saved
800969, p. 61
Image saved
800969, p. 62
Image saved
800969, p. 63
Image saved
801045, p. 1
Image saved
801045, p. 2
Image saved
801138, p. 1
Image saved
801138, p. 2
Image saved
801138, p. 3
Image saved
801138, p. 4
Image saved
801138, p. 5
Image saved
801138, p. 6
Image saved
801138, p. 7
Image saved
801146, p. 1
Image saved
801146, p. 2
Image saved
801146, p. 3
Image saved
801146, p. 4
Image saved
801146, p. 5
Image saved
801146, p. 6
Image saved
801146, p. 7
Image saved
801146, p. 8
Image saved
801146, p. 9
Image saved
801146, p. 10
Image saved
801146, p. 11
Image saved
801146, p. 12
Image saved
801146, p. 13
Image saved
801146, p. 14
Image saved
801146, p. 15
Image saved
801244, p. 1
Image saved
801244, p. 2
Image saved
801244, p. 3
Image saved
801244, p. 4
Image saved
801443, p. 1
Image saved
801443, p. 2
Image saved
801443, p. 3
Image saved
801458, p. 1
Image saved
801458, p. 2
Image saved
8

Image saved
804764, p. 24
Image saved
804764, p. 25
Image saved
804764, p. 26
Image saved
804764, p. 27
Image saved
804764, p. 28
Image saved
804764, p. 29
Image saved
804764, p. 30
Image saved
804764, p. 31
Image saved
804764, p. 32
Image saved
804764, p. 33
Image saved
804764, p. 34
Image saved
804764, p. 35
Image saved
804764, p. 36
Image saved
804764, p. 37
Image saved
804764, p. 38
Image saved
804764, p. 39
Image saved
804764, p. 40
Image saved
804764, p. 41
Image saved
804764, p. 42
Image saved
804764, p. 43
Image saved
804764, p. 44
Image saved
804764, p. 45
Image saved
804764, p. 46
Image saved
804764, p. 47
Image saved
804764, p. 48
Image saved
804764, p. 49
Image saved
804764, p. 50
Image saved
804764, p. 51
Image saved
804764, p. 52
Image saved
804764, p. 53
Image saved
804764, p. 54
Image saved
804764, p. 55
Image saved
804764, p. 56
Image saved
804764, p. 57
Image saved
804764, p. 58
Image saved
804764, p. 59
Image saved
804764, p. 60
Image saved
804764, p. 61
Image saved


Image saved
809730, p. 4
Image saved
809730, p. 5
Image saved
809784, p. 1
Image saved
809784, p. 2
Image saved
809784, p. 3
Image saved
809784, p. 4
Image saved
809784, p. 5
Image saved
809784, p. 6
Image saved
809784, p. 7
Image saved
809784, p. 8
Image saved
809999, p. 1
Image saved
809999, p. 2
Image saved
809999, p. 3
Image saved
809999, p. 4
Image saved
809999, p. 5
Image saved
809999, p. 6
Image saved
809999, p. 7
Image saved
809999, p. 8
Image saved
809999, p. 9
Image saved
809999, p. 10
Image saved
809999, p. 11
Image saved
809999, p. 12
Image saved
809999, p. 13
Image saved
809999, p. 14
Image saved
809999, p. 15
Image saved
809999, p. 16
Image saved
809999, p. 17
Image saved
809999, p. 18
Image saved
809999, p. 19
Image saved
810028, p. 1
Image saved
810028, p. 2
Image saved
810028, p. 3
Image saved
810028, p. 4
Image saved
810028, p. 5
Image saved
810028, p. 6
Image saved
810028, p. 7
Image saved
810133, p. 1
Image saved
810133, p. 2
Image saved
810133, p. 3
Image saved
810

Image saved
814338, p. 24
Image saved
814338, p. 25
Image saved
814338, p. 26
Image saved
814338, p. 27
Image saved
814338, p. 28
Image saved
814338, p. 29
Image saved
814338, p. 30
Image saved
814338, p. 31
Image saved
814338, p. 32
Image saved
814338, p. 33
Image saved
814444, p. 1
Image saved
814444, p. 2
Image saved
814444, p. 3
Image saved
814444, p. 4
Image saved
814479, p. 1
Image saved
814479, p. 2
Image saved
814479, p. 3
Image saved
814479, p. 4
Image saved
814479, p. 5
Image saved
814479, p. 6
Image saved
814479, p. 7
Image saved
814479, p. 8
Image saved
814479, p. 9
Image saved
814479, p. 10
Image saved
814479, p. 11
Image saved
814479, p. 12
Image saved
814479, p. 13
Image saved
814479, p. 14
Image saved
814656, p. 1
Image saved
814656, p. 2
Image saved
814656, p. 3
Image saved
814656, p. 4
Image saved
814656, p. 5
Image saved
814656, p. 6
Image saved
814656, p. 7
Image saved
815104, p. 1
Image saved
815126, p. 1
Image saved
815126, p. 2
Image saved
815126, p. 3
Image save

Image saved
815961, p. 25
Image saved
815961, p. 26
Image saved
815961, p. 27
Image saved
815961, p. 28
Image saved
815972, p. 1
Image saved
815972, p. 2
Image saved
815972, p. 3
Image saved
815972, p. 4
Image saved
815972, p. 5
Image saved
815972, p. 6
Image saved
815972, p. 7
Image saved
815972, p. 8
Image saved
815972, p. 9
Image saved
815972, p. 10
Image saved
815972, p. 11
Image saved
815972, p. 12
Image saved
815972, p. 13
Image saved
815972, p. 14
Image saved
815972, p. 15
Image saved
815972, p. 16
Image saved
815972, p. 17
Image saved
815972, p. 18
Image saved
815972, p. 19
Image saved
815972, p. 20
Image saved
815972, p. 21
Image saved
815972, p. 22
Image saved
815972, p. 23
Image saved
815972, p. 24
Image saved
815972, p. 25
Image saved
815972, p. 26
Image saved
815972, p. 27
Image saved
815972, p. 28
Image saved
815972, p. 29
Image saved
815972, p. 30
Image saved
815972, p. 31
Image saved
815972, p. 32
Image saved
815972, p. 33
Image saved
815972, p. 34
Image saved
815972, p

Image saved
817400, p. 21
Image saved
817400, p. 22
Image saved
817400, p. 23
Image saved
817400, p. 24
Image saved
817400, p. 25
Image saved
817400, p. 26
Image saved
817400, p. 27
Image saved
817400, p. 28
Image saved
817400, p. 29
Image saved
817400, p. 30
Image saved
817400, p. 31
Image saved
817400, p. 32
Image saved
817400, p. 33
Image saved
817400, p. 34
Image saved
817400, p. 35
Image saved
817400, p. 36
Image saved
817400, p. 37
Image saved
817400, p. 38
Image saved
817400, p. 39
Image saved
817400, p. 40
Image saved
817400, p. 41
Image saved
817400, p. 42
Image saved
817400, p. 43
Image saved
817463, p. 1
Image saved
817463, p. 2
Image saved
817463, p. 3
Image saved
817476, p. 1
Image saved
817476, p. 2
Image saved
817476, p. 3
Image saved
817633, p. 1
Image saved
817633, p. 2
Image saved
817633, p. 3
Image saved
817645, p. 1
Image saved
817645, p. 2
Image saved
817645, p. 3
Image saved
817645, p. 4
Image saved
817645, p. 5
Image saved
817645, p. 6
Image saved
817645, p. 7
Im

Image saved
820632, p. 7
Image saved
820632, p. 8
Image saved
820632, p. 9
Image saved
820632, p. 10
Image saved
820632, p. 11
Image saved
820632, p. 12
Image saved
820632, p. 13
Image saved
820632, p. 14
Image saved
820632, p. 15
Image saved
820632, p. 16
Image saved
820632, p. 17
Image saved
820632, p. 18
Image saved
820632, p. 19
Image saved
820632, p. 20
Image saved
820632, p. 21
Image saved
820632, p. 22
Image saved
820632, p. 23
Image saved
820720, p. 1
Image saved
820720, p. 2
Image saved
820720, p. 3
Image saved
820720, p. 4
Image saved
820720, p. 5
Image saved
820720, p. 6
Image saved
820720, p. 7
Image saved
820720, p. 8
Image saved
820720, p. 9
Image saved
820720, p. 10
Image saved
820720, p. 11
Image saved
820720, p. 12
Image saved
820720, p. 13
Image saved
820720, p. 14
Image saved
820720, p. 15
Image saved
820720, p. 16
Image saved
820720, p. 17
Image saved
820720, p. 18
Image saved
820720, p. 19
Image saved
820720, p. 20
Image saved
820720, p. 21
Image saved
820720, p. 2

Image saved
12205163, p. 22
Image saved
12205163, p. 23
Image saved
12205163, p. 24
Image saved
12205163, p. 25
Image saved
12205163, p. 26
Image saved
12205163, p. 27
Image saved
12205163, p. 28
Image saved
12205163, p. 29
Image saved
12205163, p. 30
Image saved
12205163, p. 31
Image saved
12205163, p. 32
Image saved
12205163, p. 33
Image saved
12205163, p. 34
Image saved
12205163, p. 35
Image saved
12205163, p. 36
Image saved
12205163, p. 37
Image saved
12205163, p. 38
Image saved
12205163, p. 39
Image saved
12205163, p. 40
Image saved
12205163, p. 41
Image saved
12205163, p. 42
Image saved
12205163, p. 43
Image saved
12205163, p. 44
Image saved
12205163, p. 45
Image saved
12205163, p. 46
Image saved
12205163, p. 47
Image saved
12205163, p. 48
Image saved
12205163, p. 49
Image saved
12205163, p. 50
Image saved
12205163, p. 51
Image saved
12205163, p. 52
Image saved
12205163, p. 53
Image saved
12205163, p. 54
Image saved
12205163, p. 55
Image saved
12205163, p. 56
Image saved
12205163

In [None]:
# B13 is > 20,000 items, so harvest it in chunks.
# No control_range is given, so the default prefixes are used
# (each uppercase letter and each digit, followed by a wildcard).
harvest_large_series('B13')

In [3]:
# A1 is a large series that needs a custom control range to harvest.
# This generates a list of control_symbol prefixes that should break it down into harvestable chunks (< 20,000 items):
# every letter, the digits 2-9, then 10-18, with 19 split further into 190-199.
# This should also work with series that use the current year as the prefix, eg 1935/190
# Just feed this range to harvest_large_series() -- eg. harvest_large_series('A1', control_range)
control_range = (
    [letter + '*' for letter in string.ascii_uppercase]
    + ['{}*'.format(digit) for digit in range(2, 10)]
    + ['1{}*'.format(digit) for digit in range(9)]
    + ['19{}*'.format(digit) for digit in range(10)]
)
print(control_range)

['A*', 'B*', 'C*', 'D*', 'E*', 'F*', 'G*', 'H*', 'I*', 'J*', 'K*', 'L*', 'M*', 'N*', 'O*', 'P*', 'Q*', 'R*', 'S*', 'T*', 'U*', 'V*', 'W*', 'X*', 'Y*', 'Z*', '2*', '3*', '4*', '5*', '6*', '7*', '8*', '9*', '10*', '11*', '12*', '13*', '14*', '15*', '16*', '17*', '18*', '190*', '191*', '192*', '193*', '194*', '195*', '196*', '197*', '198*', '199*']


In [None]:
# Harvest A1 in chunks, one harvest per prefix in the custom control_range built above
harvest_large_series('A1', control_range)

## Saving as CSV-formatted files

In [9]:
# Assuming you've already harvested the series!
# Writes one CSV file per series in series_list, read from that series' TinyDB file.
for s in series_list:
 save_as_csv(s)

In [10]:
# Don't forget B13 -- it isn't in series_list because it was harvested separately as a large series
save_as_csv('B13')