{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get a list of species records from the Museums Victoria collection\n", "\n", "The Museums Victoria collection API accepts four `recordtype` values: 'article', 'item', 'species', and 'specimen'. In this notebook we'll build a simple harvester to download all the 'species' records.\n", "\n", "See the Museums Victoria [collection API documentation](https://collections.museumsvictoria.com.au/developers) for more information." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import what we need" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from tqdm.auto import tqdm\n", "import pandas as pd\n", "from IPython.display import display, FileLink" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Base search url\n", "SEARCH_URL = 'https://collections.museumsvictoria.com.au/api/search'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define some functions" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_totals(params):\n", " '''\n", " Get the total results and pages from a search.\n", " '''\n", " response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n", " # The total results and pages values are in the API response's headers!\n", " total_results = int(response.headers['Total-Results'])\n", " total_pages = int(response.headers['Total-Pages'])\n", " return (total_results, total_pages)\n", "\n", "def harvest_species():\n", " '''\n", " Download all the species records, saving the record id, taxon name, and common name.\n", " Returns a list of species.\n", " '''\n", " species = []\n", " params = {\n", " 'query': ' ',\n", " 'recordtype': 'species',\n", " 'sort': 'date',\n", " 'perpage': 100\n", " }\n", " total_results, total_pages = get_totals(params)\n", " # Loop through the total pages, downloading a page of results at a time\n", " for page in tqdm(range(1, total_pages + 1)):\n", " # Update the page value\n", " params['page'] = page\n", " # Make a request to the API\n", " response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n", " # Loop through the results\n", " for record in response.json():\n", " # Look for the taxonomy section of the record\n", " taxonomy = record['taxonomy']\n", " if taxonomy:\n", " # Save species info\n", " species.append({'id': record['id'], 'taxon_name': taxonomy['taxonName'], 'common_name': taxonomy['commonName']})\n", " return species" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvest the records!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "species = harvest_species()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert to a dataframe and save as a CSV" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtaxon_namecommon_name
0species/8583Melangyna viridicepsCommon Hover Fly
1species/8307Tetractenos glaberSmooth Toadfish
2species/8815SalticidaeJumping Spider
3species/8456Hydromys chrysogasterCommon Water Rat
4species/12377Dromaius novaehollandiaeEmu
\n", "
" ], "text/plain": [ " id taxon_name common_name\n", "0 species/8583 Melangyna viridiceps Common Hover Fly\n", "1 species/8307 Tetractenos glaber Smooth Toadfish\n", "2 species/8815 Salticidae Jumping Spider\n", "3 species/8456 Hydromys chrysogaster Common Water Rat\n", "4 species/12377 Dromaius novaehollandiae Emu" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(species)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many species are recorded in the Museum of Victoria collection?" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1411, 3)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save the list as a CSV file so we can make use of it elsewhere" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "museum-victoria-species.csv
" ], "text/plain": [ "/Volumes/Workspace/mycode/glam-workbench/museums-victoria/notebooks/museum-victoria-species.csv" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.to_csv('museum-victoria-species.csv', index=False)\n", "display(FileLink('museum-victoria-species.csv'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/). Support me by becoming a [GitHub sponsor](https://github.com/sponsors/wragge)!" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }