{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get a list of species records from the Museums Victoria collection\n", "\n", "The Museums Victoria collection API accepts four `recordtype` values: 'article', 'item', 'species', and 'specimen'. In this notebook we'll build a simple harvester to download all the 'species' records.\n", "\n", "See the Museums Victoria [collection API documentation](https://collections.museumsvictoria.com.au/developers) for more information." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import what we need" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from tqdm.auto import tqdm\n", "import pandas as pd\n", "from IPython.display import display, FileLink" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Base search url\n", "SEARCH_URL = 'https://collections.museumsvictoria.com.au/api/search'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define some functions" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_totals(params):\n", " '''\n", " Get the total results and pages from a search.\n", " '''\n", " response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n", " # The total results and pages values are in the API response's headers!\n", " total_results = int(response.headers['Total-Results'])\n", " total_pages = int(response.headers['Total-Pages'])\n", " return (total_results, total_pages)\n", "\n", "def harvest_species():\n", " '''\n", " Download all the species records, saving the record id, taxon name, and common name.\n", " Returns a list of species.\n", " '''\n", " species = []\n", " params = {\n", " 'query': ' ',\n", " 'recordtype': 'species',\n", " 'sort': 'date',\n", " 'perpage': 100\n", " }\n", " total_results, total_pages = get_totals(params)\n", " # Loop through the total pages, downloading a page of results at a time\n", " for page in tqdm(range(1, total_pages + 1)):\n", " # Update the page value\n", " params['page'] = page\n", " # Make a request to the API\n", " response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n", " # Loop through the results\n", " for record in response.json():\n", " # Look for the taxonomy section of the record\n", " taxonomy = record['taxonomy']\n", " if taxonomy:\n", " # Save species info\n", " species.append({'id': record['id'], 'taxon_name': taxonomy['taxonName'], 'common_name': taxonomy['commonName']})\n", " return species" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvest the records!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "species = harvest_species()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert to a dataframe and save as a CSV" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id | \n", "taxon_name | \n", "common_name | \n", "
---|---|---|---|
0 | \n", "species/8583 | \n", "Melangyna viridiceps | \n", "Common Hover Fly | \n", "
1 | \n", "species/8307 | \n", "Tetractenos glaber | \n", "Smooth Toadfish | \n", "
2 | \n", "species/8815 | \n", "Salticidae | \n", "Jumping Spider | \n", "
3 | \n", "species/8456 | \n", "Hydromys chrysogaster | \n", "Common Water Rat | \n", "
4 | \n", "species/12377 | \n", "Dromaius novaehollandiae | \n", "Emu | \n", "