{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Export a Trove list to Zotero" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "from urllib.parse import urlparse, urlsplit\n", "import requests\n", "import os.path\n", "import tempfile\n", "import copy\n", "import re\n", "from pyzotero import zotero\n", "from trove import Trove\n", "\n", "# Lookup table from the 'type' value of a Trove work record to a Zotero item type.\n", "# guess_zotero_type() falls back to 'journalArticle' for any type not listed here.\n", "TROVE_ZOTERO_MAPPINGS = {\n", " 'Art work': 'artwork',\n", " 'Article': 'journalArticle',\n", " 'Article/Book chapter': 'bookSection',\n", " 'Article/Conference paper': 'conferencePaper',\n", " 'Article/Journal or magazine article': 'journalArticle',\n", " 'Article/Other article': 'journalArticle',\n", " 'Article/Report': 'report',\n", " 'Article/Review': 'journalArticle',\n", " 'Article/Working paper': 'report',\n", " 'Audio book': 'book',\n", " 'Book': 'book',\n", " 'Book/Braille': 'book',\n", " 'Book/Illustrated': 'book',\n", " 'Book/Large print': 'book',\n", " 'Conference Proceedings': 'book',\n", " 'Data set': 'computerProgram',\n", " 'Map': 'map',\n", " 'Map/Aerial photograph': 'map',\n", " 'Map/Atlas': 'map',\n", " 'Map/Braille': 'map',\n", " 'Map/Electronic': 'map',\n", " 'Map/Globe or object': 'map',\n", " 'Map/Large print': 'map',\n", " 'Map/Map series': 'map',\n", " 'Map/Microform': 'map',\n", " 'Map/Single map': 'map',\n", " 'Object': 'artwork',\n", " 'Periodical': 'book',\n", " 'Periodical/Journal, magazine, other': 'book',\n", " 'Periodical/Newspaper': 'book',\n", " 'Photograph': 'artwork',\n", " 'Poster, chart, other': 'artwork',\n", " 'Published': 'document',\n", " 'Sheet music': 'document',\n", " 'Sound': 'audioRecording',\n", " 'Sound/Interview, lecture, talk': 'audioRecording',\n", " 'Sound/Other sound': 'audioRecording',\n", " 'Sound/Recorded music': 'audioRecording',\n", " 'Thesis': 'thesis',\n", " 'Unpublished': 'manuscript',\n", " 'Video': 'videoRecording',\n", " 'Video/Captioned': 'videoRecording'\n", "}\n", "\n", 
"# For each Zotero item type: Trove record field -> Zotero template field it populates.\n",
    "FIELD_MAPPINGS = {\n",
    "    # Not including common fields that have multiple values\n",
    "    # contributor -> creator\n",
    "    # tags -> tags\n",
    "    # subject -> tags\n",
    "    'journalArticle': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'publicationTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'book': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'edition': 'edition',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'bookSection': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'bookTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'publisher': 'publisher',\n",
    "        'edition': 'edition',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'conferencePaper': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'proceedingsTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'report': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'institution',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'thesis': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'university',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'artwork': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'map': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'computerProgram': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'company',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'document': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'manuscript': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'audioRecording': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'seriesTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'label',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'videoRecording': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'seriesTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'studio',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'encyclopediaArticle': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'encyclopediaTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'webpage': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'websiteTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'rights': 'rights'\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def guess_zotero_type(item_type):\n",
    "    '''\n",
    "    Check mappings to try and find a zotero type.\n",
    "\n",
    "    item_type is a Trove type string, or a list of them (only the first is used).\n",
    "    Returns the mapped Zotero item type, or 'journalArticle' when the type is\n",
    "    not in TROVE_ZOTERO_MAPPINGS or the list is empty.\n",
    "    '''\n",
    "    if isinstance(item_type, list):\n",
    "        # An empty list has no first entry -- treat it like an unknown type.\n",
    "        item_type = item_type[0] if item_type else None\n",
    "    return TROVE_ZOTERO_MAPPINGS.get(item_type, 'journalArticle')\n",
    "\n",
    "\n",
    "def process_name(name):\n",
    "    '''\n",
    "    Try and do some cleaning of names that have dates included.\n",
    "\n",
    "    Splits name on commas and keeps only the first two parts, so anything after\n",
    "    the second comma is dropped. Surrounding whitespace is stripped from both parts.\n",
    "    Returns a dict with 'family_name' and 'other_names' ('' when there is no comma).\n",
    "    '''\n",
    "    parts = [part.strip() for part in name.split(',')]\n",
    "    family_name = parts[0]\n",
    "    other_names = parts[1] if len(parts) > 1 else ''\n",
    "    return {'family_name': family_name, 'other_names': other_names}\n",
    "\n",
    "\n",
    "def extract_filename_from_url(url):\n",
    "    '''\n",
    "    Try and get the filename of attachments.\n",
    "\n",
    "    Returns the last segment of the url path if it contains one of the recognised\n",
    "    extensions (.jpg, .gif, .png, .tif, .pdf), otherwise None.\n",
    "    '''\n",
    "    filename = os.path.basename(urlsplit(url).path)\n",
    "    # '.tif' and '.pdf' are separate alternatives; they were previously fused into\n",
    "    # a single '.tif.pdf' pattern, so neither extension was ever recognised.\n",
    "    if not re.search(r'\\.(?:jpg|gif|png|tif|pdf)', filename):\n",
    "        filename = None\n",
    "    return filename\n",
    "\n",
    "\n",
    "def prepare_attachment(url, default):\n",
    "    '''\n",
    "    Save a local copy of attachment, and return the local file path.\n",
    "\n",
    "    The file is written to the system temp directory, named after the filename\n",
    "    in the url when it has a recognised extension, otherwise after default.\n",
    "    '''\n",
    "    response = requests.get(url)\n",
    "    filename = extract_filename_from_url(url)\n",
    "    if not filename:\n",
    "        filename = default\n",
    "    folder = tempfile.gettempdir()\n",
    "    filename = os.path.join(folder, filename)\n",
    "    with open(filename, 'wb') as attachment:\n",
    "        attachment.write(response.content)\n",
    "    return filename\n",
    "\n",
    "\n",
    "def prepare_tags(tags):\n",
    "    '''\n",
    "    Takes a list of tags and formats in the object format expected by Zotero.\n",
    "    '''\n",
    "    return [{'tag': tag} for tag in tags]\n",
    "\n",
    "\n",
    "def get_newspaper_pdf(article_id):\n",
    "    '''\n",
    "    Use my proxy app to get the url to the PDF copy of an article.\n",
    "\n",
    "    Returns the body of the proxy's response as text.\n",
    "    '''\n",
    "    response = requests.get('https://trove-proxy.herokuapp.com/pdf/{}'.format(article_id))\n",
    "    return response.text\n",
    "\n",
    "\n",
    "def create_zotero_object(zotero_api, trove_api, record):\n",
    "    '''\n",
    "    Process Trove record to populate fields in Zotero item template.\n",
    "\n",
    "    The first key of record names the item type: 'work', 'people', 'article'\n",
    "    or 'externalWebsite'. Returns a dict holding the populated template under\n",
    "    'zotero_item' and a list of local file paths under 'attachments'.\n",
    "    Raises ValueError for any other item type.\n",
    "    '''\n",
    "    attachments = []\n",
    "    item_type = list(record.keys())[0]\n",
    "    item = record[item_type]\n",
    "    if item_type == 'work':\n",
    "        zotero_type = guess_zotero_type(item['type'])\n",
    "        zotero_template = zotero_api.item_template(zotero_type)\n",
    "        template = copy.deepcopy(zotero_template)\n",
    "        work = trove_api.get_item(item_id=item['id'], item_type='work')\n",
    "        details = work.get_details()\n",
    "        fields = FIELD_MAPPINGS[zotero_type]\n",
    "        for t_field, z_field in fields.items():\n",
    "            if t_field in details:\n",
    "                template[z_field] = '; '.join(details[t_field])\n",
    "        if 'contributor' in details and details['contributor']:\n",
    "            # Clone the template's first (blank) creator entry for every contributor,\n",
    "            # rather than indexing into a list that may only hold that one entry.\n",
    "            blank_creator = template['creators'][0]\n",
    "            template['creators'] = [\n",
    "                dict(blank_creator, firstName=names['other_names'], lastName=names['family_name'])\n",
    "                for names in map(process_name, details['contributor'])\n",
    "            ]\n",
    "        tags = work.get_all_tags()\n",
    "        if 'subject' in details:\n",
    "            tags.extend(details['subject'])\n",
    "        if tags:\n",
    "            template['tags'] = prepare_tags(tags)\n",
    "        source = work.get_repository()\n",
    "        repository = None\n",
    "        if source['nuc']:\n",
    "            # Look the contributor up by its 'nuc' value and use its title as the repository name.\n",
    "            contributor = trove_api.get_item(item_id=source['nuc'], item_type='contributor')\n",
    "            repository = contributor.get_title()\n",
    "        elif source['repository']:\n",
    "            repository = source['repository']\n",
    "        if repository:\n",
    "            template['archive'] = repository\n",
    "        urls = work.get_urls()\n",
    "        # Prefer the medium resolution image, falling back to the thumbnail.\n",
    "        if 'mediumresolution' in urls:\n",
    "            image_url = urls['mediumresolution']\n",
    "        elif 'thumbnail' in urls:\n",
    "            image_url = urls['thumbnail']\n",
    "        else:\n",
    "            image_url = None\n",
    "        if image_url:\n",
    "            attachments.append(prepare_attachment(image_url, 'image.jpg'))\n",
    "        pdf_url = work.get_pdf_url()\n",
    "        if pdf_url:\n",
    "            attachments.append(prepare_attachment(pdf_url, 'article.pdf'))\n",
    "\n",
    "    elif item_type == 'people':\n",
    "        zotero_type = 'encyclopediaArticle'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = 'Trove party record'\n",
    "        template['url'] = item['troveUrl']\n",
    "\n",
    "    elif item_type == 'article':\n",
    "        zotero_type = 'newspaperArticle'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = item['heading']\n",
    "        template['url'] = 'http://nla.gov.au/nla.news-article' + item['id']\n",
    "        template['publicationTitle'] = item['title']['value']\n",
    "        template['pages'] = item['page']\n",
    "        template['date'] = item['date']\n",
    "        pdf_url = get_newspaper_pdf(item['id'])\n",
    "        attachments.append(prepare_attachment(pdf_url, 'article-{}.pdf'.format(item['id'])))\n",
    "\n",
    "    elif item_type == 'externalWebsite':\n",
    "        zotero_type = 'webpage'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = item['title']\n",
    "        template['url'] = item['identifier']['value']\n",
    "\n",
    "    else:\n",
    "        # Without this, 'template' would be unbound below and fail with an unhelpful error.\n",
    "        raise ValueError('Unsupported Trove list item type: {}'.format(item_type))\n",
    "\n",
    "    if template['itemType'] != 'webpage':\n",
    "        template['libraryCatalog'] = 'Trove'\n",
    "    template['accessDate'] = datetime.datetime.now().date().isoformat()\n",
    "    return {'zotero_item': template, 'attachments': attachments}\n",
    "\n",
    "\n",
    "def create_zotero_collection(zotero_api, collection_name):\n",
    "    '''\n",
    "    Creates a Zotero collection with the given name,\n",
    "    then retrieves the key for that collection.\n",
    "\n",
    "    Returns the collection key, or None if collection_name is empty or the\n",
    "    collection could not be created or found again (a message is printed).\n",
    "    '''\n",
    "    collection_key = None\n",
    "    if collection_name:\n",
    "        created = zotero_api.create_collections([{'name': collection_name}])\n",
    "        if created:\n",
    "            collections = zotero_api.collections(q=collection_name)\n",
    "            try:\n",
    "                collection_key = collections[0]['key']\n",
    "            except (IndexError, KeyError):\n",
    "                print('Error retrieving collection key.')\n",
    "        else:\n",
    "            print('Error creating collection.')\n",
    "    return collection_key\n",
    "\n",
    "\n",
    "def check_duplicate_collection(zotero_api, collection_name):\n",
    "    '''\n",
    "    Check to see if a collection with the supplied name already exists.\n",
    "\n",
    "    Returns the key of the first collection found by searching for\n",
    "    collection_name, or None if the search returns nothing.\n",
    "    '''\n",
    "    collections = zotero_api.collections(q=collection_name)\n",
    "    if collections:\n",
    "        collection_key = collections[0]['key']\n",
    "    else:\n",
    "        collection_key = None\n",
    "    return collection_key\n",
    "\n",
    "\n",
    "def export_list(list_id, zotero_api, trove_api):\n",
    "    '''\n",
    "    Export the items in a Trove list to a Zotero collection.\n",
    "\n",
    "    The collection is named '<list title> (Trove list: <list_id>)'. A collection\n",
    "    found by searching for that name is reused, otherwise a new one is created.\n",
    "    Items marked as deleted are skipped. Each remaining item is created in Zotero,\n",
    "    and any attachments saved for it are uploaded as children of the new item.\n",
    "    Progress is reported with print().\n",
    "    '''\n",
    "    trove_list = trove_api.get_item(item_id=list_id, item_type='list')\n",
    "    list_name = '{} (Trove list: {})'.format(trove_list.get_title(), list_id)\n",
    "    print(list_name)\n",
    "    collection_key = check_duplicate_collection(zotero_api, list_name)\n",
    "    if not collection_key:\n",
    "        collection_key = create_zotero_collection(zotero_api, list_name)\n",
    "    if collection_key:\n",
    "        for item in trove_list.list_items:\n",
    "            if 'deleted' not in item:\n",
    "                details = create_zotero_object(zotero_api, trove_api, item)\n",
    "                zotero_item = details['zotero_item']\n",
    "                zotero_item['collections'] = [collection_key]\n",
    "                response = zotero_api.create_items([zotero_item])\n",
    "                # Only one item is sent, so a successful write is reported under index '0'.\n",
    "                created = response.get('successful', {}).get('0')\n",
    "                if not created:\n",
    "                    print('Error adding item: {}'.format(zotero_item['title']))\n",
    "                    continue\n",
    "                if details['attachments']:\n",
    "                    zotero_api.attachment_simple(details['attachments'], created['key'])\n",
    "                print('New item added: {}'.format(zotero_item['title']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add your details here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is a temporary group id created for demo purposes, replace it with your own personal or group library id\n",
    "zotero_library_id = '2315662'\n",
    "# Type should be either 'user' or 'group'\n",
    "zotero_library_type = 'group' # either 'user' or 'group'\n",
    "# This is a temporary key created for demonstration purposes\n",
    "zotero_library_key = 'zUjtvoJwuUNTwxKGIbj6t8wt'\n",
    "\n",
    "zot_api = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_library_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace this with the id of the list you want to export\n",
    "trove_list_id = '83777'\n",
    "# This is a temporary key created for demonstration purposes, replace it with your own\n",
    "trove_api_key = 'ju3rgk0jp354ikmh'\n",
    "\n",
    "trove_api = Trove(trove_api_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start the export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "export_list(trove_list_id, zot_api, trove_api)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}