{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Harvest GLAM datasets from data.gov.au\n", "\n", "**Because of problems with duplicate records in data.gov.au, I no longer use this notebook to harvest details of GLAM datasets. See [GLAM data from government portals](glam_data_from_gov_portals.ipynb) instead.**\n", "\n", "This is a quick attempt to harvest datasets published by GLAM institutions using the new [data.gov.au API](https://data.gov.au/api/v0/apidocs/index.html).\n", "\n", "To create the list of organisations, I searched the organisations on the [data.gov.au site](https://data.gov.au/) for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a query. There are inconsistencies in the way organisations are listed, so it's possible I've missed some." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import requests\n", "from IPython.display import FileLink, display\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from slugify import slugify\n", "\n", "s = requests.Session()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n", "s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n", "s.mount(\"https://\", HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "api_url = \"https://data.gov.au/api/v0/search/datasets\"\n", "organisations = [\n", " \"NSW State Archives\",\n", " \"National Archives of Australia\",\n", " \"Libraries Tasmania\",\n", " \"State Records\",\n", " \"State Records Office of Western Australia\",\n", " \"State Library of Victoria\",\n", " \"State Library of NSW\",\n", " \"Mount Gambier Library\",\n", " \"National Library of Australia\",\n", " \"State Library of Queensland\",\n", " \"State Library of Western Australia\",\n", " \"State Library of South Australia\",\n", " \"State Library of New South Wales\",\n", " \"Western Australian Museum\",\n", " \"South Australian Museum\",\n", " \"Museum of Applied Arts and Sciences\",\n", " \"Tasmanian Museum and Art Gallery\",\n", " \"History Trust of South Australia\",\n", " \"Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)\",\n", " \"National Portrait Gallery\",\n", " \"Australian Museum\",\n", "]\n", "# No entries under organisations\n", "queries = ['\"Queensland State Archives\"', \"PROV Public Record Office\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def safe_get(dct, *keys):\n", " for key in keys:\n", " try:\n", " dct = dct[key]\n", " except (KeyError, TypeError):\n", " return None\n", " return dct\n", "\n", "\n", "def process_dataset(dataset, query=None):\n", " datafiles = []\n", " for dist in dataset[\"distributions\"]:\n", " if query:\n", " publisher = query.strip('\"')\n", " else:\n", " publisher = safe_get(dataset, \"publisher\", \"name\")\n", " datafile = {\n", " \"dataset_title\": safe_get(dataset, \"title\"),\n", " \"publisher\": publisher,\n", " \"dataset_issued\": safe_get(dataset, \"issued\"),\n", " \"dataset_modified\": safe_get(dataset, \"modified\"),\n", " \"dataset_description\": safe_get(dataset, \"description\"),\n", " \"source\": safe_get(dataset, \"catalog\"),\n", " \"info_url\": safe_get(dataset, \"landingPage\"),\n", " \"start_date\": safe_get(dataset, \"temporal\", \"start\", \"date\"),\n", " \"end_date\": safe_get(dataset, \"temporal\", \"end\", \"date\"),\n", " \"file_title\": safe_get(dist, \"title\"),\n", " \"download_url\": safe_get(dist, \"downloadURL\"),\n", " \"format\": safe_get(dist, \"format\"),\n", " \"file_description\": safe_get(dist, \"description\"),\n", " \"file_issued\": safe_get(dist, \"issued\"),\n", " \"file_modified\": safe_get(dist, \"modified\"),\n", " \"licence\": safe_get(dist, \"license\", \"name\"),\n", " }\n", " datafiles.append(datafile)\n", " return datafiles\n", "\n", "\n", "def harvest_datasets():\n", " datafiles = []\n", " for organisation in organisations:\n", " response = s.get(api_url, params={\"publisher\": organisation, \"limit\": 100})\n", " print(response.url)\n", " data = response.json()\n", " for dataset in data[\"dataSets\"]:\n", " datafiles += process_dataset(dataset)\n", " for query in queries:\n", " response = s.get(api_url, params={\"query\": query, \"limit\": 100})\n", " print(response.url)\n", " data = response.json()\n", " for dataset in data[\"dataSets\"]:\n", " datafiles += process_dataset(dataset, query=query)\n", " return datafiles" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "datafiles = harvest_datasets()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(datafiles)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"format\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"licence\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"publisher\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"glam_datasets_all_formats_from_datagovau.csv\", index=False)\n", "display(FileLink(\"glam_datasets_all_formats_from_datagovau.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "csvs = df.loc[df[\"format\"] == \"CSV\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "csvs.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "csvs[\"publisher\"].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "csvs.to_csv(\"glam_datasets_csvs_from_datagovau.csv\", index=False)\n", "display(FileLink(\"glam_datasets_csvs_from_datagovau.csv\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Write results to a markdown file\n", "\n", "orgs = df.sort_values(by=[\"publisher\", \"dataset_title\", \"dataset_modified\"]).groupby(\n", " \"publisher\"\n", ")\n", "with open(\"glam_datasets_from_datagovau.md\", \"w\") as md_file:\n", " for org, group in orgs:\n", " print(\"* [{}](#{})\".format(org, slugify(org)))\n", " md_file.write(\"\\n## {}\\n\".format(org))\n", " for dataset, files in group.groupby([\"dataset_title\", \"info_url\"]):\n", " md_file.write(\"\\n### [{}]({})\\n\".format(dataset[0], dataset[1]))\n", " for row in files.itertuples():\n", " md_file.write(\n", " \"* [{}]({}) ({}, {})\\n\".format(\n", " row.file_title, row.download_url, row.format, row.file_issued\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3.9.9 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" }, "vscode": { "interpreter": { "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }