{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Harvest agencies associated with *all* functions\n", "\n", "This notebook loops through the list of functions that were [extracted from the RecordSearch interface](harvesting_functions_from_recordsearch.ipynb) and saves basic details of the agencies responsible for each function. To keep down the file size and avoid too much duplication it doesn't include the full range of relationships that an agency might have. If you want the full agency data, use [this notebook](get_agencies_associated_with_function.ipynb) to harvest agencies associated with an indivividual function or hierarchy.\n", "\n", "The JSON data file created has the following structure:\n", "\n", "``` json\n", "[\n", " {\n", " 'term': FUNCTION NAME\n", " 'agencies': [\n", " 'agency_id': AGENCY IDENTIFIER,\n", " 'title': AGENCY NAME,\n", " 'dates': {\n", " 'date_str': AGENCY LIFE DATES AS A STRING,\n", " 'start_date': AGENCY START DATE (YYYY-MM-DD),\n", " 'end_date': AGENCY END DATE (YYYY-MM-DD),\n", " },\n", " 'agency_status': TYPE/LEVEL OF AGENCY,\n", " 'location': AGENCY LOCATION,\n", " 'function_start_date': DATE AGENCY STARTED BEING RESPONSIBLE FOR THIS FUNCTION (YYYY-MM-DD),\n", " 'function_end_date': DATE AGENCY STOPPED BEING RESPONSIBLE FOR THIS FUNCTION (YYYY-MM-DD),\n", " ]\n", " }\n", "]\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set up the harvesting code" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import math\n", "import time\n", "import ipywidgets as widgets\n", "from IPython.display import display, HTML, FileLink, clear_output\n", "import pandas as pd\n", "from tqdm import tqdm_notebook\n", "from slugify import slugify\n", "from tinydb import TinyDB, Query\n", "from recordsearch_tools.client import RSAgencySearchClient\n", "from recordsearch_tools.utilities import retry\n", "import arrow" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class AgencyHarvester(object):\n", " '''\n", " Searches for agencies associated with a particular function.\n", " Loops through pages in the results set saving agency details.\n", " '''\n", " \n", " def __init__(self, function):\n", " self.function = function\n", " self.total_pages = 0\n", " self.total_results = 0\n", " self.agencies = []\n", " self.client = RSAgencySearchClient()\n", " self.prepare_harvest()\n", "\n", " @retry(ConnectionError, tries=20, delay=10, backoff=1)\n", " def prepare_harvest(self):\n", " '''\n", " Finds the number of results and calculates how many pages need to be harvested.\n", " '''\n", " # Setting results_per_page to zero makes things much faster\n", " response = self.client.search_agencies(function=self.function, sort='1', results_per_page=0)\n", " total_results = self.client.total_results\n", " print('{} agencies'.format(total_results if total_results else 'No'))\n", " if total_results:\n", " self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1\n", "\n", " @retry(ConnectionError, tries=20, delay=10, backoff=1)\n", " def start_harvest(self, start=1):\n", " '''\n", " Loop through each page of results saving the results.\n", " '''\n", " if self.total_pages:\n", " for page in tqdm_notebook(range(start, self.total_pages + 1), unit='page', desc='Pages:'):\n", " response = self.client.search_agencies(function=self.function, page=page, sort='1', date_format='iso')\n", " self.agencies += response['results']\n", " time.sleep(1)\n", " \n", "def get_children(function):\n", " '''\n", " Gets child terms of a given function.\n", " '''\n", " f_list = []\n", " if 'narrower' in function:\n", " for subf in function['narrower']:\n", " f_list.append(subf['term'])\n", " f_list += get_children(subf)\n", " return f_list\n", "\n", "def load_functions():\n", " '''\n", " Loads a pre-harvested JSON file containing functions data.\n", " Returns a flat list of functions.\n", " '''\n", " functions_list = []\n", " with open('data/functions.json', 'r') as json_file:\n", " functions = json.load(json_file)\n", " for function in functions:\n", " functions_list.append(function['term'])\n", " functions_list += get_children(function)\n", " # Get rid of duplicates\n", " functions_list = set(functions_list)\n", " # Sort terms\n", " functions_list = sorted(functions_list)\n", " return functions_list\n", "\n", "def get_function_dates(function, agency):\n", " '''\n", " Get the dates an agency was responsible for a given function.\n", " '''\n", " dates = {}\n", " # Loop through the functions associated with an agency\n", " for f in agency['functions']:\n", " # Find the current function\n", " if f['identifier'].lower() == function:\n", " # Get the dates this agency was responsible for the current function\n", " dates['function_start_date'] = f['start_date']\n", " dates['function_end_date'] = f['end_date']\n", " break\n", " return dates\n", "\n", "def get_all_agencies():\n", " '''\n", " Sends function terms off to the harvester to get related agencies.\n", " '''\n", " clear_output()\n", " Record = Query()\n", " # Get a list of functions\n", " functions = load_functions()\n", " db = TinyDB('data/db_agencies_by_function')\n", " # Loop through the list of functions\n", " for function in functions:\n", " clear_output()\n", " print('\\nHarvesting \"{}\"'.format(function))\n", " # Fire up the harvester for this function\n", " client = AgencyHarvester(function=function)\n", " client.start_harvest()\n", " agencies = []\n", " # Create a subset of the agency data to limit the filesize\n", " for a in client.agencies:\n", " # Keep the fields we want\n", " agency = {k: a[k] for k in ['agency_id', 'title', 'dates', 'agency_status', 'location']}\n", " # Add extra fields to show when the agency was responsible for this function\n", " agency.update(get_function_dates(function, a))\n", " agencies.append(agency)\n", " db.upsert({'term': function, 'agencies': agencies}, Record.term == function)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Start the harvest" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Harvesting \"zoology\"\n", "No agencies\n" ] } ], "source": [ "get_all_agencies()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save the results for download" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def save_json():\n", " db = TinyDB('data/db_agencies_by_function')\n", " functions = db.all()\n", " filename = 'data/agencies_by_function.json'\n", " with open(filename, 'w') as json_file:\n", " json.dump(functions, json_file, indent=4)\n", " display(FileLink(filename))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "data/agencies_by_function.json
" ], "text/plain": [ "/Users/tim/mycode/glam-workbench/recordsearch/notebooks/data/agencies_by_function.json" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "save_json()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }