# %%
import io
import json
import sqlite3
import zipfile
from pathlib import Path

import markdown2
import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database

# Cached HTTP session; 502/503/504 responses are retried up to 5 times with backoff.
s = requests_cache.CachedSession()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("https://", HTTPAdapter(max_retries=retries))
s.mount("http://", HTTPAdapter(max_retries=retries))

# %% [markdown]
# ## Save local copies of all CSV datasets

# %%
df_csvs = pd.read_csv("glam-datasets-from-gov-portals-csvs.csv")

# %%
# Blank out missing values so the string operations below never see NaN.
df_csvs = df_csvs.fillna("")

# %%
# file_index doubles as the local file name: publisher, file title and the
# date part (first 10 characters) of the creation timestamp, each slugified.
df_csvs["file_index"] = df_csvs.apply(
    lambda x: f'{slugify(x["publisher"])}-{slugify(x["file_title"])}-{slugify(x["file_created"][:10])}',
    axis=1,
)


# %%
def read_csv(url, header=0, encoding=0):
    """
    Loop through some encoding/parsing options to see if we can get the CSV to open properly.

    The first attempt uses the pandas defaults. A UnicodeDecodeError moves on to
    the next entry in `encodings`; a ParserError moves on to the next entry in
    `headers`. Not called by the cells visible in this section.

    Parameters:
        url: anything `pd.read_csv` accepts as its first argument.
        header: 0 for the pandas default, otherwise a 1-based position in `headers`.
        encoding: 0 for the pandas default, otherwise a 1-based position in `encodings`.

    Returns:
        The parsed DataFrame.

    Raises:
        UnicodeDecodeError: every encoding option has been tried.
        pd.errors.ParserError: every header option has been tried.
    """
    # Both names are aliases of the same Python codec.
    encodings = ["ISO-8859-1", "latin-1"]
    headers = [None]
    options = {"sep": None, "engine": "python", "na_values": ["-", " "]}
    if encoding > 0:
        options["encoding"] = encodings[encoding - 1]
    if header > 0:
        options["header"] = headers[header - 1]
    try:
        return pd.read_csv(url, **options)
    except UnicodeDecodeError:
        if encoding == len(encodings):
            raise
        return read_csv(url=url, header=header, encoding=encoding + 1)
    except pd.errors.ParserError:
        if header == len(headers):
            raise
        return read_csv(url=url, header=header + 1, encoding=encoding)


# %%
csv_dir = Path("csvs")
csv_dir.mkdir(exist_ok=True)

for row in df_csvs.itertuples():
    try:
        response = s.get(row.download_url)
        # Must be *called*: the bare attribute reference never raised, so HTTP
        # error pages were being saved as if they were CSV files.
        response.raise_for_status()
    except RequestException:
        # Report the dataset and skip it. Without the `continue` the write below
        # would either fail on an undefined `response` or save the previous
        # row's body under this row's file name.
        print(row.dataset_title)
        continue
    # Explicit encoding so the saved files do not depend on the platform default.
    Path(csv_dir, f"{row.file_index}.csv").write_text(response.text, encoding="utf-8")

# %% [markdown]
# ## Create a list of datasets for index checking
# %%
# Build one record per downloaded file for index checking: the catalogue
# details from df_csvs plus the file's own column names.
# Files that pandas cannot read (the contents aren't CSV) are reported as
# "Error -- <file_index>" -- delete these. Files with no matching row in
# df_csvs are reported as "No details -- <file_index>".
detail_cols = ["publisher", "info_url", "file_title", "file_modified"]
# One lookup table instead of a full scan of df_csvs per file; the first row
# wins when a file_index occurs more than once.
details_by_index = (
    df_csvs.drop_duplicates(subset="file_index")
    .set_index("file_index")[detail_cols]
    .to_dict("index")
)

dfs = []
# sorted() because glob order is arbitrary and would otherwise change the row
# order of `df` from run to run.
for csv_path in sorted(Path("csvs").glob("*.csv")):
    # Files are saved as "<file_index>.csv", so the stem is the file_index.
    file_index = csv_path.stem
    if file_index not in details_by_index:
        print(f"No details -- {file_index}")
        continue
    try:
        # Read the whole file, not just the header, so that content which is
        # not really CSV is detected here.
        df_csv = pd.read_csv(csv_path, low_memory=False)
    except Exception:
        # Best-effort check: report and move on. Exception rather than a bare
        # except so Ctrl-C still interrupts the loop.
        print(f"Error -- {file_index}")
        continue
    dfs.append(
        {
            **details_by_index[file_index],
            "csv_file": csv_path.name,
            "columns": "|".join(map(str, df_csv.columns)),
        }
    )
df = pd.DataFrame(dfs)
| \n", " | dataset_title | \n", "publisher | \n", "author | \n", "dataset_issued | \n", "dataset_modified | \n", "dataset_description | \n", "source | \n", "info_url | \n", "start_date | \n", "end_date | \n", "... | \n", "download_url | \n", "format | \n", "file_description | \n", "file_created | \n", "file_modified | \n", "file_size | \n", "licence | \n", "file_index | \n", "csv_file | \n", "columns | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "State Library of Queensland - Real estate maps | \n", "State Library of Queensland | \n", "opendata@slq.qld.gov.au | \n", "2012-12-07T06:05:16.640302 | \n", "2020-12-09T05:55:15.871780 | \n", "A unique collection of original maps and plans... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/959d611f-a9cf-... | \n", "\n", " | \n", " | ... | \n", "https://www.data.qld.gov.au/dataset/959d611f-a... | \n", "CSV | \n", "This updated dataset includes links to 798 dig... | \n", "2018-02-28T04:50:33.127516 | \n", "2019-08-19T06:18:57.312772 | \n", "252416 | \n", "Creative Commons Attribution 4.0 | \n", "state-library-of-queensland-real-estate-maps-f... | \n", "state-library-of-queensland-real-estate-maps-f... | \n", "Title|Description|Lat|Lon|Link|ID | \n", "
| 1 | \n", "Passport registers 1926 to 1939 | \n", "Queensland State Archives | \n", "web@archives.qld.gov.au | \n", "2013-10-14T06:10:08.409229 | \n", "2022-06-20T23:00:36.801163 | \n", "These indexes were compiled from the passport ... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/fc87f25a-dc02-... | \n", "\n", " | \n", " | ... | \n", "https://www.data.qld.gov.au/dataset/fc87f25a-d... | \n", "CSV | \n", "This open data file lists the names of immigra... | \n", "2017-01-11T23:47:35.449465 | \n", "2022-01-10T04:53:27.827980 | \n", "2831155 | \n", "Creative Commons Attribution 4.0 | \n", "queensland-state-archives-passport-clearances-... | \n", "queensland-state-archives-passport-clearances-... | \n", "Last name|Given names|Notes|Date of arrival|Ye... | \n", "
| 2 | \n", "Assisted immigration 1848 to 1912 | \n", "Queensland State Archives | \n", "web@archives.qld.gov.au | \n", "2013-03-04T06:34:34.270023 | \n", "2022-06-20T12:57:24.964249 | \n", "These indexes were created from the [Registers... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/ba182873-e8a7-... | \n", "\n", " | \n", " | ... | \n", "https://www.data.qld.gov.au/dataset/ba182873-e... | \n", "CSV | \n", "This open data file lists the names of assiste... | \n", "2013-03-05T23:30:57.308546 | \n", "2022-06-14T07:46:06.234434 | \n", "2621440 | \n", "Creative Commons Attribution 4.0 | \n", "queensland-state-archives-assisted-immigration... | \n", "queensland-state-archives-assisted-immigration... | \n", "Last name|Given names|Notes|Age|Ship|Date|Year... | \n", "
| 3 | \n", "Australian South Sea Islanders 1867 to 1908 | \n", "Queensland State Archives | \n", "web@archives.qld.gov.au | \n", "2014-06-25T04:29:57.438596 | \n", "2022-06-20T13:07:35.777233 | \n", "This index was compiled from a wide variety of... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/eae0afa9-681c-... | \n", "\n", " | \n", " | ... | \n", "https://www.data.qld.gov.au/dataset/eae0afa9-6... | \n", "CSV | \n", "This open data file lists the names (L-Z) of A... | \n", "2017-01-11T01:32:27.747955 | \n", "2017-01-11T01:32:27.556535 | \n", "13107200 | \n", "Creative Commons Attribution 4.0 | \n", "queensland-state-archives-australian-south-sea... | \n", "queensland-state-archives-australian-south-sea... | \n", "Last name|Given name/s|Page|Date|Ref|Prev sys ... | \n", "
| 4 | \n", "Queensland Museum collection of protozoan spec... | \n", "Queensland Museum | \n", "opendata@qm.qld.gov.au | \n", "2014-02-18T23:18:45.102073 | \n", "2019-07-10T16:42:34.524484 | \n", "A list of specimens of protozoan species in Qu... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/4f1071f2-f4fa-... | \n", "\n", " | \n", " | ... | \n", "http://www.qm.qld.gov.au/microsites/data/proto... | \n", "CSV | \n", "A CSV file containing records of all protozoan... | \n", "2014-02-18T23:19:05.331656 | \n", "2017-06-23T00:00:00 | \n", "41733324 | \n", "Creative Commons Attribution 4.0 | \n", "queensland-museum-queensland-museum-protozoan-... | \n", "queensland-museum-queensland-museum-protozoan-... | \n", "dcterms:type|dcterms:modified|dcterms:language... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 723 | \n", "SA FOI – number of fee waiver or reduction by ... | \n", "State Records South Australia | \n", "State Records | \n", "2018-02-02T04:38:06.752608 | \n", "2018-12-23T01:22:00.630016 | \n", "2017-18 annual reporting data on the number of... | \n", "data.sa.gov.au | \n", "https://data.sa.gov.au/data/dataset/f923f9b0-b... | \n", "2012-07-01 | \n", "2018-06-30 | \n", "... | \n", "https://data.sa.gov.au/data/dataset/f923f9b0-b... | \n", "CSV | \n", "2017-18 annual reporting data on the number of... | \n", "2018-02-02T15:38:27.930264 | \n", "2018-12-23T01:22:00.608148 | \n", "\n", " | Creative Commons Attribution | \n", "state-records-south-australia-sa-foi-number-of... | \n", "state-records-south-australia-sa-foi-number-of... | \n", "Reason for fee waiver, by sector|No. of waiver... | \n", "
| 724 | \n", "State Library of Queensland - Catalogue searches | \n", "State Library of Queensland | \n", "opendata@slq.qld.gov.au | \n", "2012-12-07T05:55:14.502123 | \n", "2021-03-08T07:42:00.611055 | \n", "This open data file contains the text strings ... | \n", "data.qld.gov.au | \n", "https://data.qld.gov.au/dataset/cebb997c-1c42-... | \n", "\n", " | \n", " | ... | \n", "https://www.data.qld.gov.au/dataset/cebb997c-1... | \n", "CSV | \n", "The text strings searched and count of recurri... | \n", "2019-06-18T06:37:31.010964 | \n", "2019-08-27T01:13:09.674578 | \n", "104448 | \n", "Creative Commons Attribution 4.0 | \n", "state-library-of-queensland-july-2017-catalogu... | \n", "state-library-of-queensland-july-2017-catalogu... | \n", "Search strings|Count | \n", "
| 725 | \n", "World War I Soldiers and Nurses (1914-1928). | \n", "Libraries Tasmania | \n", "Libraries Tasmania | \n", "2015-06-15T03:04:09.056176 | \n", "2021-11-23T14:36:42.489452 | \n", "Photographs, articles and applications for lan... | \n", "data.gov.au | \n", "https://data.gov.au/dataset/b711231a-2a02-48eb... | \n", "1914 | \n", "1928 | \n", "... | \n", "https://data.gov.au/data/dataset/b711231a-2a02... | \n", "CSV | \n", "\n", " | 2016-03-22T10:01:58.539607 | \n", "2021-11-23 | \n", "2835528 | \n", "Creative Commons Attribution 4.0 International | \n", "libraries-tasmania-world-war-one-tasmanian-pho... | \n", "libraries-tasmania-world-war-one-tasmanian-pho... | \n", "DIGITAL_OBJECT - URL_TEXT|DIGITAL_OBJECT - URL... | \n", "
| 726 | \n", "Deceased Estate Files, 1880-1923 | \n", "NSW State Archives | \n", "State Records Authority | \n", "2014-09-30T04:52:48.805972 | \n", "2016-07-20T12:09:20.785878 | \n", "Researching deceased estates files before 1923... | \n", "data.nsw.gov.au | \n", "https://data.nsw.gov.au/data/dataset/5d45437c-... | \n", "\n", " | \n", " | ... | \n", "https://data.nsw.gov.au/data/dataset/5d45437c-... | \n", "CSV | \n", "This dataset contains the following attributes... | \n", "2014-09-30T00:55:53.313012 | \n", "\n", " | \n", " | Creative Commons Attribution | \n", "nsw-state-archives-deceased-estates-2014-09-30 | \n", "nsw-state-archives-deceased-estates-2014-09-30... | \n", "Surname|FirstName|Locality|DateOfDeath|DateDut... | \n", "
| 727 | \n", "SA Memory | \n", "State Library of South Australia | \n", "State Library of South Australia | \n", "2013-03-07T16:15:35.228085 | \n", "2019-08-29T02:29:51.427322 | \n", "A selected and wide range of digitised archiva... | \n", "data.sa.gov.au | \n", "https://data.sa.gov.au/data/dataset/7cd90f98-1... | \n", "1836-2010 | \n", "\n", " | ... | \n", "https://data.sa.gov.au/data/dataset/7cd90f98-1... | \n", "CSV | \n", "A selected and wide range of digitised archiva... | \n", "2013-05-31T01:01:00.469271 | \n", "2019-08-28T23:40:58.400220 | \n", "1495812 | \n", "Creative Commons Attribution | \n", "state-library-of-south-australia-sa-memory-201... | \n", "state-library-of-south-australia-sa-memory-201... | \n", "id|TITLE|CREATOR|INNOPAC|LINK|coverage_place|C... | \n", "
728 rows × 21 columns
\n", "| \n", " | publisher | \n", "info_url | \n", "csv_file | \n", "index | \n", "drop | \n", "extract | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "Australian Institute of Aboriginal and Torres ... | \n", "https://data.gov.au/dataset/11cbf24a-a31a-488c... | \n", "australian-institute-of-aboriginal-and-torres-... | \n", "\n", " | \n", " | \n", " |
| 1 | \n", "Libraries Tasmania | \n", "https://data.gov.au/dataset/b0627a17-6783-4c18... | \n", "libraries-tasmania-bankruptcy-csv-2017-07-14.csv | \n", "NAME|NAME_SEE_ALSO | \n", "\n", " | \n", " |
| 2 | \n", "Libraries Tasmania | \n", "https://data.gov.au/dataset/069a423b-abd8-4454... | \n", "libraries-tasmania-colonial-secretary-correspo... | \n", "DESC|NAME|NAME_SEE_ALSO | \n", "\n", " | \n", " |
| 3 | \n", "Libraries Tasmania | \n", "https://data.gov.au/dataset/58a9a8d7-01e0-43df... | \n", "libraries-tasmania-court-csv-2017-07-14.csv | \n", "NAME | \n", "\n", " | \n", " |
| 4 | \n", "Libraries Tasmania | \n", "https://data.gov.au/dataset/d7ec2d93-b9dd-482b... | \n", "libraries-tasmania-digitised-archives-csv-2016... | \n", "\n", " | \n", " | \n", " |
| \n", " | publisher | \n", "info_url | \n", "file_title_x | \n", "file_modified_x | \n", "csv_file | \n", "file_title_y | \n", "file_modified_y | \n", "columns | \n", "index | \n", "drop | \n", "extract | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "State Library of Queensland | \n", "https://data.qld.gov.au/dataset/959d611f-a9cf-... | \n", "Real Estate Maps February 2018 | \n", "2019-08-19T06:18:57.312772 | \n", "state-library-of-queensland-real-estate-maps-f... | \n", "Real Estate Maps February 2018 | \n", "2019-08-19T06:18:57.313 | \n", "Title|Description|Lat|Lon|Link|ID | \n", "\n", " | \n", " | \n", " |
| 1 | \n", "Queensland State Archives | \n", "https://data.qld.gov.au/dataset/fc87f25a-dc02-... | \n", "Passport clearances 1923 to 1940 | \n", "2022-01-10T04:53:27.827980 | \n", "queensland-state-archives-passport-clearances-... | \n", "Passport clearances 1923 to 1940 | \n", "2022-01-10T04:53:27.828 | \n", "Last name|Given names|Notes|Date of arrival|Ye... | \n", "Last name|Given names | \n", "Description | \n", "\n", " |
| 2 | \n", "Queensland State Archives | \n", "https://data.qld.gov.au/dataset/ba182873-e8a7-... | \n", "Assisted immigration 1848 to 1912 - A | \n", "2022-06-14T07:46:06.234434 | \n", "queensland-state-archives-assisted-immigration... | \n", "Assisted immigration 1848 to 1912 - A | \n", "2022-06-14T07:46:06.234 | \n", "Last name|Given names|Notes|Age|Ship|Date|Year... | \n", "\n", " | \n", " | \n", " |
| 3 | \n", "Queensland State Archives | \n", "https://data.qld.gov.au/dataset/eae0afa9-681c-... | \n", "Australian South Sea Islanders 1867 to 1908 L-Z | \n", "2017-01-11T01:32:27.556535 | \n", "queensland-state-archives-australian-south-sea... | \n", "Australian South Sea Islanders 1867 to 1908 L-Z | \n", "2017-01-11T01:32:27.557 | \n", "Last name|Given name/s|Page|Date|Ref|Prev sys ... | \n", "\n", " | \n", " | \n", " |
| 4 | \n", "Queensland Museum | \n", "https://data.qld.gov.au/dataset/4f1071f2-f4fa-... | \n", "Queensland Museum protozoan collection records | \n", "2017-06-23T00:00:00 | \n", "queensland-museum-queensland-museum-protozoan-... | \n", "Queensland Museum protozoan collection records | \n", "2017-06-23T00:00:00 | \n", "dcterms:type|dcterms:modified|dcterms:language... | \n", "\n", " | \n", " | \n", " |
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 723 | \n", "State Records South Australia | \n", "https://data.sa.gov.au/data/dataset/f923f9b0-b... | \n", "SA FOI – number of fee waiver or reduction by ... | \n", "2018-12-23T01:22:00.608148 | \n", "state-records-south-australia-sa-foi-number-of... | \n", "SA FOI – number of fee waiver or reduction by ... | \n", "2018-12-23T01:22:00.608 | \n", "Reason for fee waiver, by sector|No. of waiver... | \n", "\n", " | \n", " | \n", " |
| 724 | \n", "State Library of Queensland | \n", "https://data.qld.gov.au/dataset/cebb997c-1c42-... | \n", "July 2017 Catalogue searches | \n", "2019-08-27T01:13:09.674578 | \n", "state-library-of-queensland-july-2017-catalogu... | \n", "July 2017 Catalogue searches | \n", "2019-08-27T01:13:09.675 | \n", "Search strings|Count | \n", "\n", " | \n", " | \n", " |
| 725 | \n", "Libraries Tasmania | \n", "https://data.gov.au/dataset/b711231a-2a02-48eb... | \n", "World War One Tasmanian Photographs - CSV | \n", "2021-11-23 | \n", "libraries-tasmania-world-war-one-tasmanian-pho... | \n", "World War One Tasmanian Photographs - CSV | \n", "2021-11-23 | \n", "DIGITAL_OBJECT - URL_TEXT|DIGITAL_OBJECT - URL... | \n", "NAME | \n", "\n", " | \n", " |
| 726 | \n", "NSW State Archives | \n", "https://data.nsw.gov.au/data/dataset/5d45437c-... | \n", "Deceased Estates | \n", "\n", " | nsw-state-archives-deceased-estates-2014-09-30... | \n", "Deceased Estates | \n", "\n", " | Surname|FirstName|Locality|DateOfDeath|DateDut... | \n", "\n", " | \n", " | \n", " |
| 727 | \n", "State Library of South Australia | \n", "https://data.sa.gov.au/data/dataset/7cd90f98-1... | \n", "SA Memory | \n", "2019-08-28T23:40:58.400220 | \n", "state-library-of-south-australia-sa-memory-201... | \n", "SA Memory | \n", "2019-08-28T23:40:58.400 | \n", "id|TITLE|CREATOR|INNOPAC|LINK|coverage_place|C... | \n", "\n", " | \n", " | \n", " |
728 rows × 11 columns
\n", "Search for names across an aggregated collection of name indexes from Australian GLAM organisations.
\n", "For more information about the datasets, see the GLAM data portals section of the GLAM Workbench.
# %%
# NOTE(review): the `metadata = {...}` literal that precedes this cell is garbled
# in the extracted source and is not reproduced here. The code below only needs
# `metadata["databases"]` to be a dict.


def _describe(row):
    """Return `(title, description_html)` for one row of df_final."""
    if row.dataset_title != row.file_title_y:
        title = f"{row.dataset_title} – {row.file_title_y}"
    else:
        title = row.file_title_y
    description = markdown2.markdown(str(row.dataset_description))
    # Only add the file-level description when it says something different.
    if row.dataset_description != row.file_description:
        description += markdown2.markdown(str(row.file_description))
    if row.file_modified_y:
        # NOTE(review): the extracted source shows only the text of this string,
        # split across two lines; the <p> wrapper is a reconstruction -- confirm
        # against the notebook.
        description += f"<p>Last modified: {row.file_modified_y}</p>"
    return title, description


def _load_records(row):
    """Read the saved CSV for `row`, apply its `drop`/`extract` edits, return records."""
    df_csv = pd.read_csv(
        Path("csvs", row.csv_file), keep_default_na=False, low_memory=False
    )
    for col in filter(None, row.drop.split("|")):
        df_csv = df_csv.drop(columns=col)
    for col in filter(None, row.extract.split("|")):
        # Keep only the first http... substring that is followed by a single
        # quote, in a new "<col>_url" column, then discard the source column.
        df_csv[f"{col}_url"] = df_csv[col].str.extract(r"(http.*?)'", expand=False)
        df_csv = df_csv.drop(columns=col)
    return df_csv.to_dict("records")


# %%
# One SQLite database per publisher, one table per CSV file.
for org, csvs in df_final.groupby(by="publisher"):
    org_slug = slugify(org)
    org_tables = {}
    metadata["databases"][org_slug] = {"title": org, "tables": org_tables}

    db_path = Path(f"{org_slug}.db")
    # Rebuild from scratch: without this, re-running the cell appends the same
    # rows again to the tables left by the previous run.
    db_path.unlink(missing_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        db = Database(conn)
        for row in csvs.itertuples():
            print(row.csv_file)
            title, description = _describe(row)
            table_name = slugify(row.file_title_y)
            org_tables[table_name] = {
                "title": title,
                "description_html": description,
                "source_url": row.download_url,
                "about_url": row.info_url,
                "license": row.licence,
                "searchmode": "raw",
            }
            table = db[table_name]
            table.insert_all(_load_records(row))
            # Same empty-entry filtering as `drop` and `extract`: a blank
            # `index` value must not reach enable_fts as [""].
            cols_to_index = [col for col in row.index.split("|") if col]
            if cols_to_index:
                table.enable_fts(cols_to_index)
    finally:
        conn.close()

# %%
# Written once, after all databases are built (the original did this in two
# identical cells).
with Path("metadata.json").open("w", encoding="utf-8") as json_file:
    json.dump(metadata, json_file)

# %%
# Space-separated list of the database files. The build cell writes them to the
# working directory; point DB_DIR elsewhere if they have been moved (this cell
# previously listed a machine-specific absolute path).
DB_DIR = Path(".")
" ".join(sorted(p.name for p in DB_DIR.glob("*.db")))
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" }, "vscode": { "interpreter": { "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" } }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }