# Scraping and Parsing: EAD XML Finding Aids from the Library of Congress

In [1]:
import os
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess

In [2]:
## Creating a directory called 'LOC_Metadata' and setting it as our current working directory

!mkdir /sharedfolder/LOC_Metadata

os.chdir('/sharedfolder/LOC_Metadata')

In [3]:
## Now we'll parse the page's HTML using BeautifulSoup ...

soup = BeautifulSoup(finding_aid_list_page, 'lxml')

## ... and examine soup.find_all('a'), which returns a list of 'a' elements (i.e., HTML links).

print(len(soup.find_all('a'))) # Checking the number of links on the page

print() # Printing a blank line for readability

print(soup.find_all('a')[70]) # Printing element #70 in the list



 
 
 
 
 
 Library of Congress Finding Aids: XML Source Files, Recorded Sound
 
 
 [XML]


In [5]:
## We can access the 'href' attribute of an element (i.e., the link URL) using 'href' in 
## brackets, just like a dictionary.

soup.find_all('a')[70]['href']

'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009003.2'

In [6]:
## Now let's make a list of every link on the page.

all_links = []

for element in soup.find_all('a'): # Looping through all 'a' elements.
 try: # Because some 'a' elements do not contain 'href' attributes, 
 all_links.append(element['href']) ## we can use a try/except statement to skip elements that 
 except: ## would otherwise raise an error.
 pass

all_links[:15] # Outputting the first 15 links in the list

['http://www.loc.gov',
 'http://www.loc.gov/rr/askalib/',
 'http://www.loc.gov/library/libarch-digital.html',
 'http://catalog.loc.gov/',
 'http://www.loc.gov',
 'http://www.loc.gov/rr/',
 '/index.html',
 '/index.html',
 '/index.html',
 '/browse/collections/a',
 '/browse/dates/main',
 '/browse/locations/main',
 '/browse/names/a',
 '/browse/titles/a',
 '/browse/subjects/a']

In [7]:
## We know that the URL for every XML file we're looking for ends in '.2', so we can
## use that fact to filter out irrelevant links.

xml_urls = []

for link in all_links:
 if link[-2:] == '.2': # Checking whether the last two characters of a link are '.2'
 xml_urls.append(link)

xml_urls # Outputting the full list of relevant XML URLs 

['http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs011001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009003.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs005002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004004.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs012002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs005001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009006.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs008001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs010002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs008002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs000001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs006002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009004.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs010001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004001.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs011002.2',
 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs0060

In [8]:
## Downloading each XML file in our list of URLs

## We can use the subprocess module (which we imported above) to issue commands in the bash shell.
## In an interactive bash shell session we'd use spaces to separate arguments; instead, subprocess
## takes arguments in the form of a Python list.

## For each item in our list, the following issues a command with two arguments: 'wget' followed by the URL.
## It thus downloads each XML file to the current directory.

for url in xml_urls:
 subprocess.call(['wget', url])

In [9]:
## Outputting a list of filenames in the current directory

## In Unix-like operating systems, './' always refers to the current directory.

os.listdir('./')

['eadmbrs.rs000001.2',
 'eadmbrs.rs004001.2',
 'eadmbrs.rs004002.2',
 'eadmbrs.rs004003.2',
 'eadmbrs.rs004004.2',
 'eadmbrs.rs005001.2',
 'eadmbrs.rs005002.2',
 'eadmbrs.rs006001.2',
 'eadmbrs.rs006002.2',
 'eadmbrs.rs008001.2',
 'eadmbrs.rs008002.2',
 'eadmbrs.rs009001.2',
 'eadmbrs.rs009003.2',
 'eadmbrs.rs009004.2',
 'eadmbrs.rs009006.2',
 'eadmbrs.rs010001.2',
 'eadmbrs.rs010002.2',
 'eadmbrs.rs011001.2',
 'eadmbrs.rs011002.2',
 'eadmbrs.rs012001.2',
 'eadmbrs.rs012002.2']

In [10]:
## Just in case there are other files in the current directory, we can use a 
## list comprehension to create a list of filenames that end in '.2' and assign
## it to the variable 'xml_filenames'.

xml_filenames = [item for item in os.listdir('./') if item[-2:]=='.2']

xml_filenames

['eadmbrs.rs000001.2',
 'eadmbrs.rs004001.2',
 'eadmbrs.rs004002.2',
 'eadmbrs.rs004003.2',
 'eadmbrs.rs004004.2',
 'eadmbrs.rs005001.2',
 'eadmbrs.rs005002.2',
 'eadmbrs.rs006001.2',
 'eadmbrs.rs006002.2',
 'eadmbrs.rs008001.2',
 'eadmbrs.rs008002.2',
 'eadmbrs.rs009001.2',
 'eadmbrs.rs009003.2',
 'eadmbrs.rs009004.2',
 'eadmbrs.rs009006.2',
 'eadmbrs.rs010001.2',
 'eadmbrs.rs010002.2',
 'eadmbrs.rs011001.2',
 'eadmbrs.rs011002.2',
 'eadmbrs.rs012001.2',
 'eadmbrs.rs012002.2']

In [11]:
## OK, that's enough exploring. Let's use soup.find_all() to create a list of 'did' elements. 

did_elements = soup.find_all('did')

print(len(did_elements)) ## Printing the number of 'did' elements in our list

print()

print(did_elements[4]) ## Printing item #4 in the the list





2
Script for the Frank Sinatra
								Show, 1944 April 26




In [18]:
## Not every 'did' element contains the same fields; different objects are described differently.

## Try running this cell several times, plugging in other index numbers to compare the way
## different items' records are formatted.

print(did_elements[7])


5

Philip Morris Playhouse script for
							"Here Comes Mr. Jordan," 1944 February
							11




In [19]:
## If you run the cell above several times with different index numbers, you'll notice that the 
## first item in the list (index 0) refers to the entire box of records, while the others are 
## individual folders or series of folders.

## To make things more complicated, some items are physically described using 'container' elements 
## while others use 'extent' instead. Most appear to include 'unittitle' and 'unitdate'.

## Our goal is to create a CSV that contains a basic description of each 'unit', or 'did' element,
## in each XML finding aid. For the purposes of this exercise, let's include the following pieces 
## of information for each unit, where available:

#### title of the source collection
#### unittitle
#### unitdate
#### container type
#### container number
#### extent

In [20]:
## Since each XML finding aid represents a single collection, we'll want to include a column that 
## identifies which collection it comes from. By reading through the XML files, we see that each 
## has a single element called 'titleproper' that describes the whole collection.

## Let's create a recipe to extract that text. Here's a first try:

collection_title = soup.find('titleproper').get_text()

collection_title

'Manfred F. DeMartino Collection of CBS\n\t\t\t\t\tRadio Scripts '

In [21]:
## That format is OK, but we should remove the tab and newline characters. Let's try again, using 
## the replace() function to replace them with spaces.

collection_title = soup.find('titleproper').get_text().replace('\t', ' ').replace('\n', ' ')

collection_title

'Manfred F. DeMartino Collection of CBS Radio Scripts '

In [22]:
## We can add the strip() function to remove the space at the end of the string.

collection_title = soup.find('titleproper').get_text().replace('\t', ' ').replace('\n', ' ').strip()

collection_title

'Manfred F. DeMartino Collection of CBS Radio Scripts'

In [23]:
## We still have a series of spaces in a row in the middle of the string. We can use a 'while loop' 
## to repeatedly replace any occurrence of ' ' (two spaces) with ' ' (one space).

collection_title = soup.find('titleproper').get_text().replace('\t', ' ').replace('\n', ' ').strip()

while ' ' in collection_title:
 collection_title = collection_title.replace(' ', ' ')

collection_title

'Manfred F. DeMartino Collection of CBS Radio Scripts'

In [24]:
## Perfect. We'll extract the collection name whenever we open an XML finding aid and include it 
## in each CSV row associated with that collection.

In [25]:
## Now on to 'unittitle'. Recall that we created a list of 'did' elements above, called 'did_elements'.

element = did_elements[4]

unittitle = element.find('unittitle').get_text()

unittitle

'Script for the Frank Sinatra\n\t\t\t\t\t\t\t\tShow, 1944 April 26\n'

In [26]:
## Since those tabs and newlines are a recurring probem, we should define a function that 
## removes them from any given text string.

def clean_text(text):
 temp_text = text.replace('\t', ' ').replace('\n', ' ').strip()
 while ' ' in temp_text:
 temp_text = temp_text.replace(' ', ' ')
 return temp_text

In [27]:
# Let's test our clean_text() function.

element = did_elements[4]

unittitle = element.find('unittitle').get_text()

unittitle = clean_text(unittitle)

unittitle

'Script for the Frank Sinatra Show, 1944 April 26'

In [28]:
## Now let's try extracting the 'unittitle' field for each 'did' element in our list.

for element in did_elements:
 unittitle = element.get_text().replace('\t', ' ').replace('\n', ' ').strip()
 print(clean_text(unittitle))
 print('-----------------') # Printing a divider between elements

Collection Summary Manfred F. DeMartino Collection of CBS Radio Scripts 1943-1945 De Martino, Manfred F. .42 linear feet (1 box) Collection materials are in English Recorded Sound Reference Center, Motion Picture, Broadcasting and Recorded Sound Division Library of Congress Washington, D.C. Scripts and a photograph acquired by Manfred F. DeMartino while working backstage at CBS radio during the mid-1940s. Includes scripts for the Frank Sinatra Show, Philip Morris Playhouse, and Your Hit Parade. RPA 00189
-----------------
Series 1. Photograph, undated 1 folder
-----------------
1 Autographed photograph of Philip Morris spokesman Johnny Roventini, undated
-----------------
Series 2. Scripts, 1943-1945 8 folders
-----------------
2 Script for the Frank Sinatra Show, 1944 April 26
-----------------
3 Script for the Frank Sinatra Show, 1944 December 4
-----------------
4 Philip Morris Playhouse script for "Magnificent Obsession," 1944 January 27
-----------------
5 Philip Morris Playhouse 

In [29]:
## The first element in the list above contains more information than we need, but we can
## let that slide for this exercise.

## Next is 'unitdate'. We'll use our clean_text() function once again.

element = did_elements[4]

unitdate = element.find('unitdate').get_text()

unitdate = clean_text(unitdate)

unitdate

'1944 April 26'

In [30]:
## Let's loop through the list of 'did' elements and see if our 'unittitle' recipe holds up.

for element in did_elements:
 unitdate = element.find('unitdate').get_text()
 print(clean_text(unitdate))
 print('-----------------') # Printing a divider between elements

1943-1945
-----------------
undated
-----------------
undated
-----------------
1943-1945
-----------------
1944 April 26
-----------------
1944 December 4
-----------------
1944 January 27
-----------------
1944 February 11
-----------------
1944 February 18
-----------------
1943 October 16
-----------------
1944 April 8
-----------------
1945 August 25
-----------------


In [31]:
## Now on to container type and number. Let's examine a 'container' XML element.

element = did_elements[4]

element.find('container')

2

In [32]:
## Since the container type ('folder', in this case) is an attribute in the 'container' tag, 
## we can extract it using bracket notation.

element = did_elements[4]

container_type = element.find('container')['type']

container_type

'folder'

In [33]:
## The container number is specified between the opening and closing 'container' tags, 
## so we can get it using get_text().

element = did_elements[4]

container_number = element.find('container').get_text()

container_number

'2'

In [34]:
## Next we'll try to get the container type and number for each 'did' element in our list ...

for element in did_elements:
 container_type = element.find('container')['type']
 print(container_type)

 container_number = element.find('container').get_text()
 print(container_number)

 print('-----------------') # Printing a divider between elements

## ... and we get an error. The reason is that some 'did' elements don't include a 'container' field.

TypeError: 'NoneType' object is not subscriptable

In [35]:
## Using try/accept notation, whenever we get an error because a container element isn't found,
## we can revert to '' (an empty string) instead.

for element in did_elements:
 try:
 container_type = element.find('container')['type']
 except:
 container_type = ''
 print(container_type)
 
 try:
 container_number = element.find('container').get_text()
 except:
 container_number = ''
 print(container_number)
 print('-----------------') # Printing a divider between elements



-----------------


-----------------
folder
1
-----------------


-----------------
folder
2
-----------------
folder
3
-----------------
folder
4
-----------------
folder
5
-----------------
folder
6
-----------------
folder
7
-----------------
folder
8
-----------------
folder
9
-----------------


In [36]:
## The last field we'll extract is 'extent', which is only included in a handful of 'did' elements.

element = did_elements[3]

extent = element.find('extent').get_text()

extent

'8 folders'

In [37]:
## Let's extract 'extent' from each element in our list of 'did' elements (for those that happen to include it).

for element in did_elements:
 try:
 extent = element.find('extent').get_text()
 except:
 extent = ''
 print(extent)
 print('-----------------') # Printing a divider between elements

.42 linear feet (1 box)
-----------------
1 folder
-----------------

-----------------
8 folders
-----------------

-----------------

-----------------

-----------------

-----------------

-----------------

-----------------

-----------------

-----------------


In [38]:
## Let's put it all together and view our chosen fields for a single 'did' element.
## We will combine our fields in a list to create a 'row' for our future CSV file.

element = did_elements[6]

# unittitle
try: # Added try/except statements for 'unittitle' and 'unitdate' just to be safe
 unittitle = clean_text(element.find('unittitle').get_text())
except:
 unittitle = ''
 
# unitdate
try:
 unitdate = clean_text(element.find('unitdate').get_text())
except:
 unitdate = ''
 
# container type and number
try:
 container_type = element.find('container')['type']
except:
 container_type = ''

try:
 container_number = element.find('container').get_text()
except:
 container_number = ''

# extent
try:
 extent = element.find('extent').get_text()
except:
 extent = ''

row = [unittitle, unitdate, container_type, container_number, extent]


print(row)

['Philip Morris Playhouse script for "Magnificent Obsession," 1944 January 27', '1944 January 27', 'folder', '4', '']


In [39]:
## Let's take a step back and generalize, so that we can extract metadata for each 
## 'did' element in a single XML file.

## We will also include the 'collection title' field ('titleproper' in EAD's vocabulary) as 
## the first item in each row.

xml_filename = xml_filenames[3] # <-- Change the index number there to run the script on another XML file in the list.


xml_text = open(xml_filename).read()

soup = BeautifulSoup(xml_text, 'lxml')

list_of_lists = [] # Creating an empty list, which we will use to hold our rows (each row represented as a list)


try:
 collection_title = clean_text(soup.find('titleproper').get_text())
except:
 collection_title = xml_filename # If the 'titleproper' field is missing for some reason,
 ## we'll use the XML filename instead.

for element in soup.find_all('did'):

 # unittitle
 try:
 unittitle = clean_text(element.find('unittitle').get_text())
 except:
 unittitle = ''
 
 # unitdate
 try:
 unitdate = clean_text(element.find('unitdate').get_text())
 except:
 unitdate = ''
 
 # container type and number
 try:
 container_type = element.find('container')['type']
 except:
 container_type = ''

 try:
 container_number = element.find('container').get_text()
 except:
 container_number = ''

 # extent
 try:
 extent = element.find('extent').get_text()
 except:
 extent = ''

 row = [collection_title, unittitle, unitdate, container_type, container_number, extent]

 list_of_lists.append(row) ## Adding the row list we defined in the previous line to 'list_of_lists' 


list_of_lists[:15] ## Outputting the first 15 rows in our list of lists

[['Papers from the Jim Walsh Collection',
 'Papers from the Jim Walsh collection 1867-1987, and undated 1913-1985',
 '1867-1987, and undated',
 '',
 '',
 '23.58 linear feet (17 boxes, 1 map case folder,\n\t\t\t\t\tapproximately 12,860 items)'],
 ['Papers from the Jim Walsh Collection',
 'Series 1. Correspondence/Research Files, 1913-1987, and undated',
 '1913-1987, and undated',
 '',
 '',
 '2.94 linear feet'],
 ['Papers from the Jim Walsh Collection', 'Correspondence', '', '', '', ''],
 ['Papers from the Jim Walsh Collection',
 'Brooks, Tim, 1968, 1975',
 '1968, 1975',
 'box',
 '1',
 ''],
 ['Papers from the Jim Walsh Collection',
 'Burt, Leah Brodbeck Stenzel, 1972-1979',
 '1972-1979',
 'box',
 '1',
 ''],
 ['Papers from the Jim Walsh Collection',
 'Columbia Phonograph Co., 1929-1957',
 '1929-1957',
 'box',
 '1',
 ''],
 ['Papers from the Jim Walsh Collection',
 'Crossett, Glenn "Curly," 1948-1958',
 '1948-1958',
 'box',
 '1',
 ''],
 ['Papers from the Jim Walsh Collection',
 'Deakins, Du

In [40]:
## Almost there! Next we'll run the script above on each XML file in our list, creating a 
## master list of lists that we'll write to disk as a CSV in the next cell.

## Let's begin by re-loading our list of XML filenames:

os.chdir('/sharedfolder/LOC_Metadata')
xml_filenames = [item for item in os.listdir('./') if item[-2:]=='.2'] # Creating a list of XML filenames

list_of_lists = [] # Creating an empty list

## Now we'll extract metadata from the full batch of XML files. This may take a few moments to complete.

for xml_filename in xml_filenames:
 
 xml_text = open(xml_filename).read()
 
 soup = BeautifulSoup(xml_text, 'lxml')
 
 try:
 collection_title = clean_text(soup.find('titleproper').get_text())
 except:
 collection_title = xml_filename # If the 'titleproper' field is missing for some reason,
 ## we'll use the XML filename instead.
 
 for element in soup.find_all('did'):
 
 # unittitle
 try:
 unittitle = clean_text(element.find('unittitle').get_text())
 except:
 unittitle = ''
 
 # unitdate
 try:
 unitdate = clean_text(element.find('unitdate').get_text())
 except:
 unitdate = ''
 
 # container type and number
 try:
 container_type = element.find('container')['type']
 except:
 container_type = ''
 
 try:
 container_number = element.find('container').get_text()
 except:
 container_number = ''
 
 # extent
 try:
 extent = element.find('extent').get_text()
 except:
 extent = ''
 
 row = [collection_title, unittitle, unitdate, container_type, container_number, extent]
 
 list_of_lists.append(row)


print(len(list_of_lists)) ## Printing the number of rows in our table

11881


In [41]:
## Finally, we write the extracted metadata to disk as a CSV called 'LOC_RS_Reduced_Metadata.csv'

out_path = "./LOC_RS_Reduced_Metadata.csv" # The './' part is optional; it just means we're writing to 
 # the current working directory.

# Defining a list of column headers, which we will write as the first row in our CSV
column_headers = ['Collection Title', 'Unit Title', 'Unit Date', 'Container Type', 'Container Number', 'Extent']

import csv # Importing Python's built-in CSV input/output package

with open(out_path, 'w') as fo: # Creating a tempory file stream object called 'fo' (my abbreviation for 'file out')
 csv_writer = csv.writer(fo) # Initializing our CSV writer
 csv_writer.writerow(column_headers) # Writing one row (our column headers)
 csv_writer.writerows(list_of_lists) # Writing a list of lists as a sequence of rows

In [None]:
## Go to 'sharedfolder' on your desktop and use LibreOffice or Excel to open your new CSV.

## As you scroll through the CSV file, you will probably see more formatting oddities you can fix 
## by tweaking the code above.