{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Harvest GLAM datasets from data.gov.au\n",
    "\n",
    "**Because of problems with duplicate records in data.gov.au, I no longer use this notebook to harvest details of GLAM datasets. See [GLAM data from government portals](glam_data_from_gov_portals.ipynb) instead.**\n",
    "\n",
    "This is a quick attempt to harvest datasets published by GLAM (galleries, libraries, archives, and museums) institutions using the new [data.gov.au API](https://data.gov.au/api/v0/apidocs/index.html).\n",
    "\n",
    "To create the list of organisations, I searched the organisations on the [data.gov.au site](https://data.gov.au/) for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a query, along with a query for the Public Record Office Victoria (PROV). There are inconsistencies in the way organisations are listed, so it's possible I've missed some."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import requests\n",
    "from IPython.display import FileLink, display\n",
    "from requests.adapters import HTTPAdapter\n",
    "from requests.packages.urllib3.util.retry import Retry\n",
    "from slugify import slugify\n",
    "\n",
    "# Shared session for every API call. Responses with a 502, 503 or 504 status\n",
    "# are retried (up to 5 times, with a backoff between attempts) on both schemes.\n",
    "s = requests.Session()\n",
    "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n",
    "for scheme in (\"http://\", \"https://\"):\n",
    "    s.mount(scheme, HTTPAdapter(max_retries=retries))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset search endpoint that every request in this notebook is sent to\n",
    "api_url = \"https://data.gov.au/api/v0/search/datasets\"\n",
    "# Publisher names, each sent as the `publisher` parameter of its own request.\n",
    "# Similar names (e.g. 'State Library of NSW' and 'State Library of New South Wales')\n",
    "# are both listed -- presumably the same institution under inconsistent names.\n",
    "organisations = [\n",
    "    \"NSW State Archives\",\n",
    "    \"National Archives of Australia\",\n",
    "    \"Libraries Tasmania\",\n",
    "    \"State Records\",\n",
    "    \"State Records Office of Western Australia\",\n",
    "    \"State Library of Victoria\",\n",
    "    \"State Library of NSW\",\n",
    "    \"Mount Gambier Library\",\n",
    "    \"National Library of Australia\",\n",
    "    \"State Library of Queensland\",\n",
    "    \"State Library of Western Australia\",\n",
    "    \"State Library of South Australia\",\n",
    "    \"State Library of New South Wales\",\n",
    "    \"Western Australian Museum\",\n",
    "    \"South Australian Museum\",\n",
    "    \"Museum of Applied Arts and Sciences\",\n",
    "    \"Tasmanian Museum and Art Gallery\",\n",
    "    \"History Trust of South Australia\",\n",
    "    \"Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)\",\n",
    "    \"National Portrait Gallery\",\n",
    "    \"Australian Museum\",\n",
    "]\n",
    "# These have no entries under organisations, so they are sent as the `query`\n",
    "# parameter instead. The query text (minus any surrounding double quotes) is\n",
    "# then recorded as the publisher of every dataset it returns.\n",
    "queries = ['\"Queensland State Archives\"', \"PROV Public Record Office\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def safe_get(dct, *keys):\n",
    "    \"\"\"Look up a value through successive keys, returning None on any miss.\n",
    "\n",
    "    Args:\n",
    "        dct: The outermost container to index into.\n",
    "        *keys: Keys applied in order, each to the result of the previous lookup.\n",
    "\n",
    "    Returns:\n",
    "        The value reached after the last key (``dct`` itself when no keys are\n",
    "        given), or None if any lookup raises KeyError or TypeError.\n",
    "    \"\"\"\n",
    "    value = dct\n",
    "    try:\n",
    "        for key in keys:\n",
    "            value = value[key]\n",
    "    except (KeyError, TypeError):\n",
    "        # A key is absent, or an intermediate value can't be indexed (e.g. None).\n",
    "        return None\n",
    "    return value\n",
    "\n",
    "\n",
    "def process_dataset(dataset, query=None):\n",
    "    \"\"\"Flatten one dataset record into a list of per-file rows.\n",
    "\n",
    "    Args:\n",
    "        dataset: A dataset record taken from the ``dataSets`` list of an API response.\n",
    "        query: The search query that found this dataset, if any. When given, it is\n",
    "            used (minus surrounding double quotes) as the publisher instead of\n",
    "            the dataset's own publisher name.\n",
    "\n",
    "    Returns:\n",
    "        A list with one dict per entry in the dataset's ``distributions``, each\n",
    "        combining the dataset-level and file-level fields. Fields that are absent\n",
    "        are None. The list is empty when there are no distributions.\n",
    "    \"\"\"\n",
    "    # The publisher is the same for every file, so work it out once.\n",
    "    if query:\n",
    "        publisher = query.strip('\"')\n",
    "    else:\n",
    "        publisher = safe_get(dataset, \"publisher\", \"name\")\n",
    "    # Dataset-level details are repeated on every file row.\n",
    "    dataset_fields = {\n",
    "        \"dataset_title\": safe_get(dataset, \"title\"),\n",
    "        \"publisher\": publisher,\n",
    "        \"dataset_issued\": safe_get(dataset, \"issued\"),\n",
    "        \"dataset_modified\": safe_get(dataset, \"modified\"),\n",
    "        \"dataset_description\": safe_get(dataset, \"description\"),\n",
    "        \"source\": safe_get(dataset, \"catalog\"),\n",
    "        \"info_url\": safe_get(dataset, \"landingPage\"),\n",
    "        \"start_date\": safe_get(dataset, \"temporal\", \"start\", \"date\"),\n",
    "        \"end_date\": safe_get(dataset, \"temporal\", \"end\", \"date\"),\n",
    "    }\n",
    "    datafiles = []\n",
    "    # A missing or null distributions list is treated as empty, in line with\n",
    "    # how every other field is looked up.\n",
    "    for dist in safe_get(dataset, \"distributions\") or []:\n",
    "        datafiles.append(\n",
    "            {\n",
    "                **dataset_fields,\n",
    "                \"file_title\": safe_get(dist, \"title\"),\n",
    "                \"download_url\": safe_get(dist, \"downloadURL\"),\n",
    "                \"format\": safe_get(dist, \"format\"),\n",
    "                \"file_description\": safe_get(dist, \"description\"),\n",
    "                \"file_issued\": safe_get(dist, \"issued\"),\n",
    "                \"file_modified\": safe_get(dist, \"modified\"),\n",
    "                \"licence\": safe_get(dist, \"license\", \"name\"),\n",
    "            }\n",
    "        )\n",
    "    return datafiles\n",
    "\n",
    "\n",
    "def _search_datasets(params, limit=100, timeout=60):\n",
    "    \"\"\"Send one search request and return the datasets in its response.\n",
    "\n",
    "    Args:\n",
    "        params: Search parameters for the request (e.g. a publisher or a query).\n",
    "        limit: Value sent as the ``limit`` parameter.\n",
    "        timeout: Seconds to wait for the server before giving up.\n",
    "\n",
    "    Returns:\n",
    "        The ``dataSets`` list from the JSON response.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the response has an error status.\n",
    "    \"\"\"\n",
    "    response = s.get(api_url, params={**params, \"limit\": limit}, timeout=timeout)\n",
    "    print(response.url)\n",
    "    # Fail with the HTTP status rather than a confusing JSON or key error later.\n",
    "    response.raise_for_status()\n",
    "    data = response.json()\n",
    "    datasets = data[\"dataSets\"]\n",
    "    # Only one page is requested, so flag it when more datasets matched.\n",
    "    # Assumes the response reports the total as `hitCount` -- TODO confirm;\n",
    "    # the note is simply skipped when that field is absent.\n",
    "    hit_count = data.get(\"hitCount\")\n",
    "    if isinstance(hit_count, int) and hit_count > len(datasets):\n",
    "        print(\n",
    "            \"  Note: {} of {} matching datasets retrieved\".format(\n",
    "                len(datasets), hit_count\n",
    "            )\n",
    "        )\n",
    "    return datasets\n",
    "\n",
    "\n",
    "def harvest_datasets(limit=100):\n",
    "    \"\"\"Harvest file details for every organisation and query.\n",
    "\n",
    "    One request is made per entry in ``organisations`` (as a publisher search)\n",
    "    and per entry in ``queries`` (as a query search).\n",
    "\n",
    "    Args:\n",
    "        limit: Value sent as the ``limit`` parameter of each request.\n",
    "\n",
    "    Returns:\n",
    "        A list of per-file dicts as produced by ``process_dataset``.\n",
    "    \"\"\"\n",
    "    datafiles = []\n",
    "    for organisation in organisations:\n",
    "        for dataset in _search_datasets({\"publisher\": organisation}, limit=limit):\n",
    "            datafiles += process_dataset(dataset)\n",
    "    for query in queries:\n",
    "        for dataset in _search_datasets({\"query\": query}, limit=limit):\n",
    "            # Query results are labelled with the query text as their publisher.\n",
    "            datafiles += process_dataset(dataset, query=query)\n",
    "    return datafiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the harvest: one API request per organisation and per query\n",
    "datafiles = harvest_datasets()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per file (distribution), with its dataset's details repeated on each row\n",
    "df = pd.DataFrame(datafiles)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (number of file rows, number of columns)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of files in each format\n",
    "df[\"format\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of files under each licence\n",
    "df[\"licence\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of files from each publisher\n",
    "df[\"publisher\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save every harvested file record, whatever its format, and link to the result\n",
    "all_formats_path = \"glam_datasets_all_formats_from_datagovau.csv\"\n",
    "df.to_csv(all_formats_path, index=False)\n",
    "display(FileLink(all_formats_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the files whose format is exactly \"CSV\"\n",
    "csvs = df[df[\"format\"].eq(\"CSV\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (number of CSV file rows, number of columns)\n",
    "csvs.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of CSV files from each publisher\n",
    "csvs[\"publisher\"].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save just the CSV-format file records and link to the result\n",
    "csvs_path = \"glam_datasets_csvs_from_datagovau.csv\"\n",
    "csvs.to_csv(csvs_path, index=False)\n",
    "display(FileLink(csvs_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write results to a markdown file\n",
    "\n",
    "# Rows are sorted before grouping so the files listed under each dataset\n",
    "# keep this order.\n",
    "orgs = df.sort_values(by=[\"publisher\", \"dataset_title\", \"dataset_modified\"]).groupby(\n",
    "    \"publisher\"\n",
    ")\n",
    "# UTF-8 is set explicitly; the platform default encoding can fail on\n",
    "# non-ASCII characters in titles and descriptions.\n",
    "with open(\"glam_datasets_from_datagovau.md\", \"w\", encoding=\"utf-8\") as md_file:\n",
    "    for org, group in orgs:\n",
    "        # The table-of-contents entry goes to the cell output, not into the file.\n",
    "        print(\"* [{}](#{})\".format(org, slugify(org)))\n",
    "        md_file.write(\"\\n## {}\\n\".format(org))\n",
    "        # groupby drops rows whose key is missing, so datasets without a landing\n",
    "        # page would vanish from the list; group them under an empty URL instead.\n",
    "        group = group.assign(info_url=group[\"info_url\"].fillna(\"\"))\n",
    "        for (title, info_url), files in group.groupby([\"dataset_title\", \"info_url\"]):\n",
    "            if info_url:\n",
    "                md_file.write(\"\\n### [{}]({})\\n\".format(title, info_url))\n",
    "            else:\n",
    "                # No landing page to link to, so write a plain heading.\n",
    "                md_file.write(\"\\n### {}\\n\".format(title))\n",
    "            for row in files.itertuples():\n",
    "                md_file.write(\n",
    "                    \"* [{}]({}) ({}, {})\\n\".format(\n",
    "                        row.file_title, row.download_url, row.format, row.file_issued\n",
    "                    )\n",
    "                )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.9 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
   }
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}