{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get a list of species records from the Museums Victoria collection\n",
    "\n",
    "The Museums Victoria collection API accepts four `recordtype` values: 'article', 'item', 'species', and 'specimen'. In this notebook we'll build a simple harvester to download all the 'species' records.\n",
    "\n",
    "See the Museums Victoria [collection API documentation](https://collections.museumsvictoria.com.au/developers) for more information."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import what we need"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from tqdm.auto import tqdm\n",
    "import pandas as pd\n",
    "from IPython.display import display, FileLink"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base search url\n",
    "SEARCH_URL = 'https://collections.museumsvictoria.com.au/api/search'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define some functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_totals(params):\n",
    "    '''\n",
    "    Get the total results and pages from a search.\n",
    "    '''\n",
    "    response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n",
    "    # The total results and pages values are in the API response's headers!\n",
    "    total_results = int(response.headers['Total-Results'])\n",
    "    total_pages = int(response.headers['Total-Pages'])\n",
    "    return (total_results, total_pages)\n",
    "\n",
    "def harvest_species():\n",
    "    '''\n",
    "    Download all the species records, saving the record id, taxon name, and common name.\n",
    "    Returns a list of species.\n",
    "    '''\n",
    "    species = []\n",
    "    params = {\n",
    "        'query': ' ',\n",
    "        'recordtype': 'species',\n",
    "        'sort': 'date',\n",
    "        'perpage': 100\n",
    "    }\n",
    "    total_results, total_pages = get_totals(params)\n",
    "    # Loop through the total pages, downloading a page of results at a time\n",
    "    for page in tqdm(range(1, total_pages + 1)):\n",
    "        # Update the page value\n",
    "        params['page'] = page\n",
    "        # Make a request to the API\n",
    "        response = requests.get(SEARCH_URL, params=params, headers={'User-Agent': 'Mozilla/5.0'})\n",
    "        # Loop through the results\n",
    "        for record in response.json():\n",
    "            # Look for the taxonomy section of the record\n",
    "            taxonomy = record['taxonomy']\n",
    "            if taxonomy:\n",
    "                # Save species info\n",
    "                species.append({'id': record['id'], 'taxon_name': taxonomy['taxonName'], 'common_name': taxonomy['commonName']})\n",
    "    return species"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Harvest the records!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "species = harvest_species()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert to a dataframe and save as a CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>taxon_name</th>\n",
       "      <th>common_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>species/8583</td>\n",
       "      <td>Melangyna viridiceps</td>\n",
       "      <td>Common Hover Fly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>species/8307</td>\n",
       "      <td>Tetractenos glaber</td>\n",
       "      <td>Smooth Toadfish</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>species/8815</td>\n",
       "      <td>Salticidae</td>\n",
       "      <td>Jumping Spider</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>species/8456</td>\n",
       "      <td>Hydromys chrysogaster</td>\n",
       "      <td>Common Water Rat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>species/12377</td>\n",
       "      <td>Dromaius novaehollandiae</td>\n",
       "      <td>Emu</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              id                taxon_name       common_name\n",
       "0   species/8583      Melangyna viridiceps  Common Hover Fly\n",
       "1   species/8307        Tetractenos glaber   Smooth Toadfish\n",
       "2   species/8815                Salticidae    Jumping Spider\n",
       "3   species/8456     Hydromys chrysogaster  Common Water Rat\n",
       "4  species/12377  Dromaius novaehollandiae               Emu"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(species)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many species are recorded in the Museum of Victoria collection?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1411, 3)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the list as a CSV file so we can make use of it elsewhere"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<a href='museum-victoria-species.csv' target='_blank'>museum-victoria-species.csv</a><br>"
      ],
      "text/plain": [
       "/Volumes/Workspace/mycode/glam-workbench/museums-victoria/notebooks/museum-victoria-species.csv"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df.to_csv('museum-victoria-species.csv', index=False)\n",
    "display(FileLink('museum-victoria-species.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).  Support me by becoming a [GitHub sponsor](https://github.com/sponsors/wragge)!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}