{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scraping and Parsing: EAD XML Finding Aids from the Library of Congress" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "from urllib.request import urlopen\n", "from bs4 import BeautifulSoup\n", "import subprocess" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "## Creating a directory called 'LOC_Metadata' and setting it as our current working directory\n", "\n", "!mkdir /sharedfolder/LOC_Metadata\n", "\n", "os.chdir('/sharedfolder/LOC_Metadata')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " Library of Congress Finding Aids: XML Source Files, Recorded Sound\n", " \n", " \n", " [XML]\n" ] } ], "source": [ "## Now we'll parse the page's HTML using BeautifulSoup ...\n", "\n", "soup = BeautifulSoup(finding_aid_list_page, 'lxml')\n", "\n", "## ... and examine soup.find_all('a'), which returns a list of 'a' elements (i.e., HTML links).\n", "\n", "print(len(soup.find_all('a'))) # Checking the number of links on the page\n", "\n", "print() # Printing a blank line for readability\n", "\n", "print(soup.find_all('a')[70]) # Printing element #70 in the list" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009003.2'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## We can access the 'href' attribute of an element (i.e., the link URL) using 'href' in \n", "## brackets, just like a dictionary.\n", "\n", "soup.find_all('a')[70]['href']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['http://www.loc.gov',\n", " 'http://www.loc.gov/rr/askalib/',\n", " 'http://www.loc.gov/library/libarch-digital.html',\n", " 'http://catalog.loc.gov/',\n", " 'http://www.loc.gov',\n", " 'http://www.loc.gov/rr/',\n", " '/index.html',\n", " '/index.html',\n", " '/index.html',\n", " '/browse/collections/a',\n", " '/browse/dates/main',\n", " '/browse/locations/main',\n", " '/browse/names/a',\n", " '/browse/titles/a',\n", " '/browse/subjects/a']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Now let's make a list of every link on the page.\n", "\n", "all_links = []\n", "\n", "for element in soup.find_all('a'): # Looping through all 'a' elements.\n", " try: # Because some 'a' elements do not contain 'href' attributes, \n", " all_links.append(element['href']) ## we can use a try/except statement to skip elements that \n", " except: ## would otherwise raise an error.\n", " pass\n", "\n", "all_links[:15] # Outputting the first 15 links in the list" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs011001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009003.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs005002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004004.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs012002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs005001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009006.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs008001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs010002.2',\n", " 
" 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs008002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs000001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs006002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs009004.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs010001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs011002.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs006001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs012001.2',\n", " 'http://hdl.loc.gov/loc.mbrsrs/eadmbrs.rs004003.2']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## We know that the URL for every XML file we're looking for ends in '.2', so we can\n", "## use that fact to filter out irrelevant links.\n", "\n", "xml_urls = []\n", "\n", "for link in all_links:\n", "    if link[-2:] == '.2': # Checking whether the last two characters of a link are '.2'\n", "        xml_urls.append(link)\n", "\n", "xml_urls # Outputting the full list of relevant XML URLs" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "## Downloading each XML file in our list of URLs\n", "\n", "## We can use the subprocess module (which we imported above) to issue commands in the bash shell.\n", "## In an interactive bash shell session we'd use spaces to separate arguments; instead, subprocess\n", "## takes arguments in the form of a Python list.\n", "\n", "## For each item in our list, the following issues a command with two arguments: 'wget' followed by the URL.\n", "## It thus downloads each XML file to the current directory.\n", "\n", "for url in xml_urls:\n", "    subprocess.call(['wget', url])" ] },
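{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## An alternative to shelling out: urllib.request.urlretrieve does the same job in pure Python,\n", "## with no dependency on wget. (A minimal sketch: run this cell or the previous one, not both;\n", "## like wget, it names each downloaded file after the last segment of its URL.)\n", "\n", "from urllib.request import urlretrieve\n", "\n", "for url in xml_urls:\n", "    urlretrieve(url, url.split('/')[-1])" ] },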
"execute_result" } ], "source": [ "## Just in case there are other files in the current directory, we can use a \n", "## list comprehension to create a list of filenames that end in '.2' and assign\n", "## it to the variable 'xml_filenames'.\n", "\n", "xml_filenames = [item for item in os.listdir('./') if item[-2:]=='.2']\n", "\n", "xml_filenames" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n", "2\n", "Script for the Frank Sinatra\n", "\t\t\t\t\t\t\t\tShow, 1944 April 26\n", "\n", "\n" ] } ], "source": [ "## OK, that's enough exploring. Let's use soup.find_all() to create a list of 'did' elements. \n", "\n", "did_elements = soup.find_all('did')\n", "\n", "print(len(did_elements)) ## Printing the number of 'did' elements in our list\n", "\n", "print()\n", "\n", "print(did_elements[4]) ## Printing item #4 in the the list" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "5\n", "\n", "Philip Morris Playhouse script for\n", "\t\t\t\t\t\t\t\"Here Comes Mr. Jordan,\" 1944 February\n", "\t\t\t\t\t\t\t11\n", "\n", "\n" ] } ], "source": [ "## Not every 'did' element contains the same fields; different objects are described differently.\n", "\n", "## Try running this cell several times, plugging in other index numbers to compare the way\n", "## different items' records are formatted.\n", "\n", "print(did_elements[7])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "## If you run the cell above several times with different index numbers, you'll notice that the \n", "## first item in the list (index 0) refers to the entire box of records, while the others are \n", "## individual folders or series of folders.\n", "\n", "## To make things more complicated, some items are physically described using 'container' elements \n", "## while others use 'extent' instead. Most appear to include 'unittitle' and 'unitdate'.\n", "\n", "## Our goal is to create a CSV that contains a basic description of each 'unit', or 'did' element,\n", "## in each XML finding aid. For the purposes of this exercise, let's include the following pieces \n", "## of information for each unit, where available:\n", "\n", "#### title of the source collection\n", "#### unittitle\n", "#### unitdate\n", "#### container type\n", "#### container number\n", "#### extent" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. DeMartino Collection of CBS\\n\\t\\t\\t\\t\\tRadio Scripts '" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Since each XML finding aid represents a single collection, we'll want to include a column that \n", "## identifies which collection it comes from. By reading through the XML files, we see that each \n", "## has a single element called 'titleproper' that describes the whole collection.\n", "\n", "## Let's create a recipe to extract that text. Here's a first try:\n", "\n", "collection_title = soup.find('titleproper').get_text()\n", "\n", "collection_title" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. 
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. DeMartino Collection of CBS\\n\\t\\t\\t\\t\\tRadio Scripts '" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Since each XML finding aid represents a single collection, we'll want to include a column that \n", "## identifies which collection it comes from. By reading through the XML files, we see that each \n", "## has a single element called 'titleproper' that describes the whole collection.\n", "\n", "## Let's create a recipe to extract that text. Here's a first try:\n", "\n", "collection_title = soup.find('titleproper').get_text()\n", "\n", "collection_title" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. DeMartino Collection of CBS      Radio Scripts '" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## That format is OK, but we should remove the tab and newline characters. Let's try again, using \n", "## the replace() function to replace them with spaces.\n", "\n", "collection_title = soup.find('titleproper').get_text().replace('\\t', ' ').replace('\\n', ' ')\n", "\n", "collection_title" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. DeMartino Collection of CBS      Radio Scripts'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## We can add the strip() function to remove the space at the end of the string.\n", "\n", "collection_title = soup.find('titleproper').get_text().replace('\\t', ' ').replace('\\n', ' ').strip()\n", "\n", "collection_title" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Manfred F. DeMartino Collection of CBS Radio Scripts'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## We still have a series of spaces in a row in the middle of the string. We can use a 'while loop' \n", "## to repeatedly replace any occurrence of '  ' (two spaces) with ' ' (one space).\n", "\n", "collection_title = soup.find('titleproper').get_text().replace('\\t', ' ').replace('\\n', ' ').strip()\n", "\n", "while '  ' in collection_title:\n", "    collection_title = collection_title.replace('  ', ' ')\n", "\n", "collection_title" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "## Perfect. We'll extract the collection name whenever we open an XML finding aid and include it \n", "## in each CSV row associated with that collection." ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Script for the Frank Sinatra\\n\\t\\t\\t\\t\\t\\t\\t\\tShow, 1944 April 26\\n'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Now on to 'unittitle'. Recall that we created a list of 'did' elements above, called 'did_elements'.\n", "\n", "element = did_elements[4]\n", "\n", "unittitle = element.find('unittitle').get_text()\n", "\n", "unittitle" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "## Since those tabs and newlines are a recurring problem, we should define a function that \n", "## removes them from any given text string.\n", "\n", "def clean_text(text):\n", "    temp_text = text.replace('\\t', ' ').replace('\\n', ' ').strip()\n", "    while '  ' in temp_text:\n", "        temp_text = temp_text.replace('  ', ' ')\n", "    return temp_text" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Script for the Frank Sinatra Show, 1944 April 26'" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's test our clean_text() function.\n", "\n", "element = did_elements[4]\n", "\n", "unittitle = element.find('unittitle').get_text()\n", "\n", "unittitle = clean_text(unittitle)\n", "\n", "unittitle" ] },
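{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## An aside: Python's split()/join() idiom collapses every run of whitespace (spaces, tabs,\n", "## newlines) in a single pass, so it could stand in for clean_text() as a one-liner.\n", "## (A minimal sketch for comparison; clean_text() above works fine for our purposes.)\n", "\n", "def clean_text_alt(text):\n", "    return ' '.join(text.split())\n", "\n", "clean_text_alt('Script for the Frank Sinatra\\n\\t\\t\\t\\t\\t\\t\\t\\tShow, 1944 April 26\\n')" ] },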
{ "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collection Summary Manfred F. DeMartino Collection of CBS Radio Scripts 1943-1945 De Martino, Manfred F. .42 linear feet (1 box) Collection materials are in English Recorded Sound Reference Center, Motion Picture, Broadcasting and Recorded Sound Division Library of Congress Washington, D.C. Scripts and a photograph acquired by Manfred F. DeMartino while working backstage at CBS radio during the mid-1940s. Includes scripts for the Frank Sinatra Show, Philip Morris Playhouse, and Your Hit Parade. RPA 00189\n", "-----------------\n", "Series 1. Photograph, undated 1 folder\n", "-----------------\n", "1 Autographed photograph of Philip Morris spokesman Johnny Roventini, undated\n", "-----------------\n", "Series 2. Scripts, 1943-1945 8 folders\n", "-----------------\n", "2 Script for the Frank Sinatra Show, 1944 April 26\n", "-----------------\n", "3 Script for the Frank Sinatra Show, 1944 December 4\n", "-----------------\n", "4 Philip Morris Playhouse script for \"Magnificent Obsession,\" 1944 January 27\n", "-----------------\n", "5 Philip Morris Playhouse script for \"Here Comes Mr. Jordan,\" 1944 February 11\n", "-----------------\n", "6 Philip Morris Playhouse script for \"The Lodger,\" 1944 February 18\n", "-----------------\n", "7 Your Hit Parade script, 1943 October 16\n", "-----------------\n", "8 Your Hit Parade script, 1944 April 8\n", "-----------------\n", "9 Your Hit Parade script, 1945 August 25\n", "-----------------\n" ] } ], "source": [ "## Now let's print the cleaned full text of each 'did' element in our list. (Note that this\n", "## includes container numbers and other fields, not just the 'unittitle' text.)\n", "\n", "for element in did_elements:\n", "    print(clean_text(element.get_text()))\n", "    print('-----------------') # Printing a divider between elements" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1944 April 26'" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## The first element in the list above contains more information than we need, but we can\n", "## let that slide for this exercise.\n", "\n", "## Next is 'unitdate'. We'll use our clean_text() function once again.\n", "\n", "element = did_elements[4]\n", "\n", "unitdate = element.find('unitdate').get_text()\n", "\n", "unitdate = clean_text(unitdate)\n", "\n", "unitdate" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1943-1945\n", "-----------------\n", "undated\n", "-----------------\n", "undated\n", "-----------------\n", "1943-1945\n", "-----------------\n", "1944 April 26\n", "-----------------\n", "1944 December 4\n", "-----------------\n", "1944 January 27\n", "-----------------\n", "1944 February 11\n", "-----------------\n", "1944 February 18\n", "-----------------\n", "1943 October 16\n", "-----------------\n", "1944 April 8\n", "-----------------\n", "1945 August 25\n", "-----------------\n" ] } ], "source": [ "## Let's loop through the list of 'did' elements and see if our 'unitdate' recipe holds up.\n", "\n", "for element in did_elements:\n", "    unitdate = element.find('unitdate').get_text()\n", "    print(clean_text(unitdate))\n", "    print('-----------------') # Printing a divider between elements" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<container type=\"folder\">2</container>" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Now on to container type and number. 
Let's examine a 'container' XML element.\n", "\n", "element = did_elements[4]\n", "\n", "element.find('container')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'folder'" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Since the container type ('folder', in this case) is an attribute in the 'container' tag, \n", "## we can extract it using bracket notation.\n", "\n", "element = did_elements[4]\n", "\n", "container_type = element.find('container')['type']\n", "\n", "container_type" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2'" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## The container number is specified between the opening and closing 'container' tags, \n", "## so we can get it using get_text().\n", "\n", "element = did_elements[4]\n", "\n", "container_number = element.find('container').get_text()\n", "\n", "container_number" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "'NoneType' object is not subscriptable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0melement\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdid_elements\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mcontainer_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0melement\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'container'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'type'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcontainer_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" ] } ], "source": [ "## Next we'll try to get the container type and number for each 'did' element in our list ...\n", "\n", "for element in did_elements:\n", " container_type = element.find('container')['type']\n", " print(container_type)\n", "\n", " container_number = element.find('container').get_text()\n", " print(container_number)\n", "\n", " print('-----------------') # Printing a divider between elements\n", "\n", "## ... and we get an error. The reason is that some 'did' elements don't include a 'container' field." 
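] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## What happened: find() returns None when an element contains no matching tag, and trying to\n", "## subscript None raises the TypeError shown above. We can confirm that the first,\n", "## collection-level 'did' element has no 'container' tag. (A quick check added for illustration.)\n", "\n", "print(did_elements[0].find('container'))\n", "\n", "print(did_elements[4].find('container') is None)"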
] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "-----------------\n", "\n", "\n", "-----------------\n", "folder\n", "1\n", "-----------------\n", "\n", "\n", "-----------------\n", "folder\n", "2\n", "-----------------\n", "folder\n", "3\n", "-----------------\n", "folder\n", "4\n", "-----------------\n", "folder\n", "5\n", "-----------------\n", "folder\n", "6\n", "-----------------\n", "folder\n", "7\n", "-----------------\n", "folder\n", "8\n", "-----------------\n", "folder\n", "9\n", "-----------------\n" ] } ], "source": [ "## Using try/accept notation, whenever we get an error because a container element isn't found,\n", "## we can revert to '' (an empty string) instead.\n", "\n", "for element in did_elements:\n", " try:\n", " container_type = element.find('container')['type']\n", " except:\n", " container_type = ''\n", " print(container_type)\n", " \n", " try:\n", " container_number = element.find('container').get_text()\n", " except:\n", " container_number = ''\n", " print(container_number)\n", " print('-----------------') # Printing a divider between elements" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'8 folders'" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## The last field we'll extract is 'extent', which is only included in a handful of 'did' elements.\n", "\n", "element = did_elements[3]\n", "\n", "extent = element.find('extent').get_text()\n", "\n", "extent" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ".42 linear feet (1 box)\n", "-----------------\n", "1 folder\n", "-----------------\n", "\n", "-----------------\n", "8 folders\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n", "\n", "-----------------\n" ] } ], "source": [ "## Let's extract 'extent' from each element in our list of 'did' elements (for those that happen to include it).\n", "\n", "for element in did_elements:\n", " try:\n", " extent = element.find('extent').get_text()\n", " except:\n", " extent = ''\n", " print(extent)\n", " print('-----------------') # Printing a divider between elements" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Philip Morris Playhouse script for \"Magnificent Obsession,\" 1944 January 27', '1944 January 27', 'folder', '4', '']\n" ] } ], "source": [ "## Let's put it all together and view our chosen fields for a single 'did' element.\n", "## We will combine our fields in a list to create a 'row' for our future CSV file.\n", "\n", "element = did_elements[6]\n", "\n", "# unittitle\n", "try: # Added try/except statements for 'unittitle' and 'unitdate' just to be safe\n", " unittitle = clean_text(element.find('unittitle').get_text())\n", "except:\n", " unittitle = ''\n", " \n", "# unitdate\n", "try:\n", " unitdate = clean_text(element.find('unitdate').get_text())\n", "except:\n", " unitdate = ''\n", " \n", "# container type and number\n", "try:\n", " container_type = element.find('container')['type']\n", "except:\n", " container_type = ''\n", "\n", "try:\n", " container_number = element.find('container').get_text()\n", 
"except:\n", " container_number = ''\n", "\n", "# extent\n", "try:\n", " extent = element.find('extent').get_text()\n", "except:\n", " extent = ''\n", "\n", "row = [unittitle, unitdate, container_type, container_number, extent]\n", "\n", "\n", "print(row)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[['Papers from the Jim Walsh Collection',\n", " 'Papers from the Jim Walsh collection 1867-1987, and undated 1913-1985',\n", " '1867-1987, and undated',\n", " '',\n", " '',\n", " '23.58 linear feet (17 boxes, 1 map case folder,\\n\\t\\t\\t\\t\\tapproximately 12,860 items)'],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Series 1. Correspondence/Research Files, 1913-1987, and undated',\n", " '1913-1987, and undated',\n", " '',\n", " '',\n", " '2.94 linear feet'],\n", " ['Papers from the Jim Walsh Collection', 'Correspondence', '', '', '', ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Brooks, Tim, 1968, 1975',\n", " '1968, 1975',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Burt, Leah Brodbeck Stenzel, 1972-1979',\n", " '1972-1979',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Columbia Phonograph Co., 1929-1957',\n", " '1929-1957',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Crossett, Glenn \"Curly,\" 1948-1958',\n", " '1948-1958',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Deakins, Duane D., 1953-1961',\n", " '1953-1961',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Debus, Allen, 1942-1985',\n", " '1942-1985',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Dethlefson, Ronald, 1981',\n", " '1981',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Dorian, Frank, 1930-1934',\n", " '1930-1934',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Edison, Thomas A. 
" ['Papers from the Jim Walsh Collection',\n", " 'Edison, Thomas A. Co., 1923-1975',\n", " '1923-1975',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Fargo, Milford, 1957-1961',\n", " '1957-1961',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Fasset, Stephen, 1940-1948, and undated',\n", " '1940-1948, and undated',\n", " 'box',\n", " '1',\n", " ''],\n", " ['Papers from the Jim Walsh Collection',\n", " 'Favia-Artsay, Aida, 1953-1962',\n", " '1953-1962',\n", " 'box',\n", " '1',\n", " '']]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## Let's take a step back and generalize, so that we can extract metadata for each \n", "## 'did' element in a single XML file.\n", "\n", "## We will also include the 'collection title' field ('titleproper' in EAD's vocabulary) as \n", "## the first item in each row.\n", "\n", "xml_filename = xml_filenames[3] # <-- Change the index number here to run the script on another XML file in the list.\n", "\n", "\n", "xml_text = open(xml_filename).read()\n", "\n", "soup = BeautifulSoup(xml_text, 'lxml')\n", "\n", "list_of_lists = [] # Creating an empty list, which we will use to hold our rows (each row represented as a list)\n", "\n", "\n", "try:\n", "    collection_title = clean_text(soup.find('titleproper').get_text())\n", "except:\n", "    collection_title = xml_filename # If the 'titleproper' field is missing for some reason,\n", "                                    ## we'll use the XML filename instead.\n", "\n", "for element in soup.find_all('did'):\n", "\n", "    # unittitle\n", "    try:\n", "        unittitle = clean_text(element.find('unittitle').get_text())\n", "    except:\n", "        unittitle = ''\n", "\n", "    # unitdate\n", "    try:\n", "        unitdate = clean_text(element.find('unitdate').get_text())\n", "    except:\n", "        unitdate = ''\n", "\n", "    # container type and number\n", "    try:\n", "        container_type = element.find('container')['type']\n", "    except:\n", "        container_type = ''\n", "\n", "    try:\n", "        container_number = element.find('container').get_text()\n", "    except:\n", "        container_number = ''\n", "\n", "    # extent\n", "    try:\n", "        extent = element.find('extent').get_text()\n", "    except:\n", "        extent = ''\n", "\n", "    row = [collection_title, unittitle, unitdate, container_type, container_number, extent]\n", "\n", "    list_of_lists.append(row) ## Adding the row list we defined in the previous line to 'list_of_lists' \n", "\n", "\n", "list_of_lists[:15] ## Outputting the first 15 rows in our list of lists" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11881\n" ] } ], "source": [ "## Almost there! Next we'll run the script above on each XML file in our list, creating a \n", "## master list of lists that we'll write to disk as a CSV in the next cell.\n", "\n", "## Let's begin by re-loading our list of XML filenames:\n", "\n", "os.chdir('/sharedfolder/LOC_Metadata')\n", "xml_filenames = [item for item in os.listdir('./') if item[-2:]=='.2'] # Creating a list of XML filenames\n", "\n", "list_of_lists = [] # Creating an empty list\n", "\n", "## Now we'll extract metadata from the full batch of XML files. 
This may take a few moments to complete.\n", "\n", "for xml_filename in xml_filenames:\n", "\n", "    xml_text = open(xml_filename).read()\n", "\n", "    soup = BeautifulSoup(xml_text, 'lxml')\n", "\n", "    try:\n", "        collection_title = clean_text(soup.find('titleproper').get_text())\n", "    except:\n", "        collection_title = xml_filename # If the 'titleproper' field is missing for some reason,\n", "                                        ## we'll use the XML filename instead.\n", "\n", "    for element in soup.find_all('did'):\n", "\n", "        # unittitle\n", "        try:\n", "            unittitle = clean_text(element.find('unittitle').get_text())\n", "        except:\n", "            unittitle = ''\n", "\n", "        # unitdate\n", "        try:\n", "            unitdate = clean_text(element.find('unitdate').get_text())\n", "        except:\n", "            unitdate = ''\n", "\n", "        # container type and number\n", "        try:\n", "            container_type = element.find('container')['type']\n", "        except:\n", "            container_type = ''\n", "\n", "        try:\n", "            container_number = element.find('container').get_text()\n", "        except:\n", "            container_number = ''\n", "\n", "        # extent\n", "        try:\n", "            extent = element.find('extent').get_text()\n", "        except:\n", "            extent = ''\n", "\n", "        row = [collection_title, unittitle, unitdate, container_type, container_number, extent]\n", "\n", "        list_of_lists.append(row)\n", "\n", "\n", "print(len(list_of_lists)) ## Printing the number of rows in our table" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "## Finally, we write the extracted metadata to disk as a CSV called 'LOC_RS_Reduced_Metadata.csv'\n", "\n", "out_path = \"./LOC_RS_Reduced_Metadata.csv\" # The './' part is optional; it just means we're writing to \n", "                                           # the current working directory.\n", "\n", "# Defining a list of column headers, which we will write as the first row in our CSV\n", "column_headers = ['Collection Title', 'Unit Title', 'Unit Date', 'Container Type', 'Container Number', 'Extent']\n", "\n", "import csv # Importing Python's built-in CSV input/output package\n", "\n", "# Opening the file with newline='' is recommended for Python's csv module; it prevents extra\n", "# blank lines between rows on some platforms.\n", "with open(out_path, 'w', newline='') as fo: # Creating a temporary file stream object called 'fo' (my abbreviation for 'file out')\n", "    csv_writer = csv.writer(fo) # Initializing our CSV writer\n", "    csv_writer.writerow(column_headers) # Writing one row (our column headers)\n", "    csv_writer.writerows(list_of_lists) # Writing a list of lists as a sequence of rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "## Go to 'sharedfolder' on your desktop and use LibreOffice or Excel to open your new CSV.\n", "\n", "## As you scroll through the CSV file, you will probably see more formatting oddities you can fix \n", "## by tweaking the code above." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }