{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Export a Trove list to Zotero"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "from urllib.parse import urlparse, urlsplit\n",
    "import requests\n",
    "import os.path\n",
    "import tempfile\n",
    "import copy\n",
    "import re\n",
    "from pyzotero import zotero\n",
    "from trove import Trove\n",
    "\n",
    "# Lookup table from Trove work 'type' values to Zotero item types.\n",
    "# Consulted by guess_zotero_type(), which falls back to 'journalArticle' for\n",
    "# any Trove type that is not listed here.\n",
    "TROVE_ZOTERO_MAPPINGS = {\n",
    "    'Art work': 'artwork',\n",
    "    'Article': 'journalArticle',\n",
    "    'Article/Book chapter': 'bookSection',\n",
    "    'Article/Conference paper': 'conferencePaper',\n",
    "    'Article/Journal or magazine article': 'journalArticle',\n",
    "    'Article/Other article': 'journalArticle',\n",
    "    'Article/Report': 'report',\n",
    "    'Article/Review': 'journalArticle',\n",
    "    'Article/Working paper': 'report',\n",
    "    'Audio book': 'book',\n",
    "    'Book': 'book',\n",
    "    'Book/Braille': 'book',\n",
    "    'Book/Illustrated': 'book',\n",
    "    'Book/Large print': 'book',\n",
    "    'Conference Proceedings': 'book',\n",
    "    'Data set': 'computerProgram',\n",
    "    'Map': 'map',\n",
    "    'Map/Aerial photograph': 'map',\n",
    "    'Map/Atlas': 'map',\n",
    "    'Map/Braille': 'map',\n",
    "    'Map/Electronic': 'map',\n",
    "    'Map/Globe or object': 'map',\n",
    "    'Map/Large print': 'map',\n",
    "    'Map/Map series': 'map',\n",
    "    'Map/Microform': 'map',\n",
    "    'Map/Single map': 'map',\n",
    "    'Object': 'artwork',\n",
    "    'Periodical': 'book',\n",
    "    'Periodical/Journal, magazine, other': 'book',\n",
    "    'Periodical/Newspaper': 'book',\n",
    "    'Photograph': 'artwork',\n",
    "    'Poster, chart, other': 'artwork',\n",
    "    'Published': 'document',\n",
    "    'Sheet music': 'document',\n",
    "    'Sound': 'audioRecording',\n",
    "    'Sound/Interview, lecture, talk': 'audioRecording',\n",
    "    'Sound/Other sound': 'audioRecording',\n",
    "    'Sound/Recorded music': 'audioRecording',\n",
    "    'Thesis': 'thesis',\n",
    "    'Unpublished': 'manuscript',\n",
    "    'Video': 'videoRecording',\n",
    "    'Video/Captioned': 'videoRecording'\n",
    "}\n",
    "\n",
    "# For each Zotero item type: Trove field name -> Zotero field name.\n",
    "# create_zotero_object() uses the entry for the guessed item type to copy values\n",
    "# from a Trove work's details into the Zotero item template.\n",
    "# NOTE: whatever 'source' writes to 'libraryCatalog' is later overwritten with\n",
    "# 'Trove' by create_zotero_object() for every item type except 'webpage'.\n",
    "FIELD_MAPPINGS = {\n",
    "    # Not included here: fields that hold multiple values and are handled\n",
    "    # separately in create_zotero_object():\n",
    "    #   contributor -> creators\n",
    "    #   tags        -> tags\n",
    "    #   subject     -> tags\n",
    "    'journalArticle': { \n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'publicationTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'book': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'edition': 'edition',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'bookSection': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'bookTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'publisher': 'publisher',\n",
    "        'edition': 'edition',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'conferencePaper': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'proceedingsTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'pagination': 'pages',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'report': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'institution',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'thesis': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'university',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'artwork': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'map': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'computerProgram': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'company',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'document': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'manuscript': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'audioRecording': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'seriesTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'label',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'videoRecording': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'seriesTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'studio',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'encyclopediaArticle': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'encyclopediaTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'publisher': 'publisher',\n",
    "        'repository': 'archive',\n",
    "        'rights': 'rights',\n",
    "        'source': 'libraryCatalog'\n",
    "    },\n",
    "    'webpage': {\n",
    "        'title': 'title',\n",
    "        'troveUrl': 'url',\n",
    "        'issued': 'date',\n",
    "        'isPartOf': 'websiteTitle',\n",
    "        'language': 'language',\n",
    "        'abstract': 'abstractNote',\n",
    "        'rights': 'rights'\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def guess_zotero_type(item_type):\n",
    "    '''\n",
    "    Map a Trove work type to a Zotero item type.\n",
    "\n",
    "    Parameters:\n",
    "        item_type: a Trove type string, or a list of type strings.\n",
    "\n",
    "    Returns:\n",
    "        The Zotero item type for the first candidate that appears in\n",
    "        TROVE_ZOTERO_MAPPINGS. Falls back to 'journalArticle' when none of\n",
    "        the candidates is mapped, including when the list is empty.\n",
    "    '''\n",
    "    candidates = item_type if isinstance(item_type, list) else [item_type]\n",
    "    # Try every listed type in order: an empty list no longer raises IndexError,\n",
    "    # and an unmapped first entry no longer hides a mapped later one.\n",
    "    for candidate in candidates:\n",
    "        if candidate in TROVE_ZOTERO_MAPPINGS:\n",
    "            return TROVE_ZOTERO_MAPPINGS[candidate]\n",
    "    return 'journalArticle'\n",
    "\n",
    "\n",
    "def process_name(name):\n",
    "    '''\n",
    "    Split a contributor name into family name and other names.\n",
    "\n",
    "    The name is split on commas: the first part is the family name and the\n",
    "    second part (if any) the other names. Any further parts, such as life\n",
    "    dates in 'Family, Given, 1900-1980', are discarded.\n",
    "\n",
    "    Parameters:\n",
    "        name: the contributor name as a string.\n",
    "\n",
    "    Returns:\n",
    "        A dict with 'family_name' and 'other_names' keys. Both values have\n",
    "        surrounding whitespace removed; 'other_names' is an empty string when\n",
    "        the name contains no comma.\n",
    "    '''\n",
    "    # Strip each part so 'Smith, John' gives 'John' rather than ' John'.\n",
    "    parts = [part.strip() for part in name.split(',')]\n",
    "    family_name = parts[0]\n",
    "    other_names = parts[1] if len(parts) > 1 else ''\n",
    "    return {'family_name': family_name, 'other_names': other_names}\n",
    "\n",
    "\n",
    "def extract_filename_from_url(url):\n",
    "    '''\n",
    "    Get an attachment filename from the last segment of a url path.\n",
    "\n",
    "    Parameters:\n",
    "        url: the attachment url; any query string or fragment is ignored.\n",
    "\n",
    "    Returns:\n",
    "        The filename if it ends in a recognised image or PDF extension\n",
    "        (jpg, jpeg, gif, png, tif, tiff, pdf; case-insensitive), otherwise None.\n",
    "    '''\n",
    "    filename = os.path.basename(urlsplit(url).path)\n",
    "    # Each extension is its own alternative and the match is anchored to the end\n",
    "    # of the name, so 'scan.tif' and 'article.pdf' are accepted but 'x.jpg.html' is not.\n",
    "    if not re.search(r'\\.(?:jpe?g|gif|png|tiff?|pdf)$', filename, re.IGNORECASE):\n",
    "        return None\n",
    "    return filename\n",
    "\n",
    "\n",
    "def prepare_attachment(url, default):\n",
    "    '''\n",
    "    Download an attachment into the system temp directory.\n",
    "\n",
    "    Parameters:\n",
    "        url: address of the file to download.\n",
    "        default: filename to use when none can be extracted from the url.\n",
    "\n",
    "    Returns:\n",
    "        The path of the saved local copy.\n",
    "\n",
    "    Raises:\n",
    "        requests.RequestException: if the request fails, times out, or the\n",
    "            server answers with an HTTP error status.\n",
    "    '''\n",
    "    # A timeout stops a stalled server from hanging the whole export.\n",
    "    response = requests.get(url, timeout=60)\n",
    "    # Fail here rather than saving an error page as if it were the image or PDF.\n",
    "    response.raise_for_status()\n",
    "    filename = extract_filename_from_url(url) or default\n",
    "    local_path = os.path.join(tempfile.gettempdir(), filename)\n",
    "    with open(local_path, 'wb') as attachment:\n",
    "        attachment.write(response.content)\n",
    "    return local_path\n",
    "\n",
    "\n",
    "def prepare_tags(tags):\n",
    "    '''\n",
    "    Wrap every tag in a single-key dict, the object format expected by Zotero.\n",
    "\n",
    "    Returns a new list with one {'tag': value} entry per input tag, in order.\n",
    "    '''\n",
    "    formatted = []\n",
    "    for label in tags:\n",
    "        formatted.append({'tag': label})\n",
    "    return formatted\n",
    "\n",
    "\n",
    "def get_newspaper_pdf(article_id):\n",
    "    '''\n",
    "    Ask the trove-proxy app for the url of the PDF copy of a newspaper article.\n",
    "\n",
    "    Parameters:\n",
    "        article_id: the Trove newspaper article id.\n",
    "\n",
    "    Returns:\n",
    "        The body of the proxy's response (the PDF url), with surrounding\n",
    "        whitespace removed.\n",
    "\n",
    "    Raises:\n",
    "        requests.RequestException: if the request fails, times out, or the\n",
    "            proxy answers with an HTTP error status.\n",
    "    '''\n",
    "    # A timeout stops a stalled proxy from hanging the whole export.\n",
    "    response = requests.get(\n",
    "        'https://trove-proxy.herokuapp.com/pdf/{}'.format(article_id),\n",
    "        timeout=60\n",
    "    )\n",
    "    # Without this check the text of an error page would be returned as the url.\n",
    "    response.raise_for_status()\n",
    "    return response.text.strip()\n",
    "\n",
    "\n",
    "def _download_attachment(url, default):\n",
    "    '''\n",
    "    Download one attachment, returning its local path, or None if the download failed.\n",
    "    '''\n",
    "    try:\n",
    "        return prepare_attachment(url, default)\n",
    "    except requests.RequestException as err:\n",
    "        # A missing thumbnail or PDF should not stop the item itself being exported.\n",
    "        print('Attachment skipped ({}): {}'.format(url, err))\n",
    "        return None\n",
    "\n",
    "\n",
    "def create_zotero_object(zotero_api, trove_api, record):\n",
    "    '''\n",
    "    Build a Zotero item, and download its attachments, from one Trove list item.\n",
    "\n",
    "    Parameters:\n",
    "        zotero_api: object providing item_template(item_type); in this notebook\n",
    "            a pyzotero Zotero instance.\n",
    "        trove_api: object providing get_item(item_id=..., item_type=...).\n",
    "        record: a dict whose first key is the record type ('work', 'people',\n",
    "            'article' or 'externalWebsite') and whose value is the record itself.\n",
    "\n",
    "    Returns:\n",
    "        A dict with two keys: 'zotero_item' (the populated item template) and\n",
    "        'attachments' (a list of local file paths that were downloaded).\n",
    "        Attachments that cannot be downloaded are reported and left out.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the record type is not one of the four handled types.\n",
    "    '''\n",
    "    attachments = []\n",
    "    # (url, default filename) pairs to download once the item has been built.\n",
    "    downloads = []\n",
    "    item_type = list(record.keys())[0]\n",
    "    item = record[item_type]\n",
    "    if item_type == 'work':\n",
    "        zotero_type = guess_zotero_type(item['type'])\n",
    "        template = copy.deepcopy(zotero_api.item_template(zotero_type))\n",
    "        work = trove_api.get_item(item_id=item['id'], item_type='work')\n",
    "        details = work.get_details()\n",
    "        for t_field, z_field in FIELD_MAPPINGS[zotero_type].items():\n",
    "            if t_field in details:\n",
    "                value = details[t_field]\n",
    "                # Multi-valued fields are flattened to one string. A plain string is\n",
    "                # used as-is; joining it would put '; ' between its characters.\n",
    "                template[z_field] = value if isinstance(value, str) else '; '.join(value)\n",
    "        if 'contributor' in details:\n",
    "            # Zotero item templates normally carry a single blank creator, so writing\n",
    "            # to creators[index] fails from the second contributor on. Clone the blank\n",
    "            # entry (it holds the creatorType for this item type) once per contributor.\n",
    "            creators = []\n",
    "            for contributor in details['contributor']:\n",
    "                names = process_name(contributor)\n",
    "                creator = dict(template['creators'][0])\n",
    "                creator['firstName'] = names['other_names']\n",
    "                creator['lastName'] = names['family_name']\n",
    "                creators.append(creator)\n",
    "            if creators:\n",
    "                template['creators'] = creators\n",
    "        # Work on a copy, in case get_all_tags() hands back a list the work object\n",
    "        # still holds; extending that in place would alter the work's own tags.\n",
    "        tags = list(work.get_all_tags() or [])\n",
    "        if 'subject' in details:\n",
    "            tags.extend(details['subject'])\n",
    "        if tags:\n",
    "            template['tags'] = prepare_tags(tags)\n",
    "        source = work.get_repository()\n",
    "        repository = None\n",
    "        if source['nuc']:\n",
    "            # Look up the contributing organisation by its NUC to get a readable name.\n",
    "            contributor = trove_api.get_item(item_id=source['nuc'], item_type='contributor')\n",
    "            repository = contributor.get_title()\n",
    "        elif source['repository']:\n",
    "            repository = source['repository']\n",
    "        if repository:\n",
    "            template['archive'] = repository\n",
    "        urls = work.get_urls()\n",
    "        # Prefer the medium resolution image, falling back to the thumbnail.\n",
    "        if 'mediumresolution' in urls:\n",
    "            image_url = urls['mediumresolution']\n",
    "        elif 'thumbnail' in urls:\n",
    "            image_url = urls['thumbnail']\n",
    "        else:\n",
    "            image_url = None\n",
    "        if image_url:\n",
    "            downloads.append((image_url, 'image.jpg'))\n",
    "        pdf_url = work.get_pdf_url()\n",
    "        if pdf_url:\n",
    "            downloads.append((pdf_url, 'article.pdf'))\n",
    "\n",
    "    elif item_type == 'people':\n",
    "        zotero_type = 'encyclopediaArticle'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = 'Trove party record'\n",
    "        template['url'] = item['troveUrl']\n",
    "\n",
    "    elif item_type == 'article':\n",
    "        zotero_type = 'newspaperArticle'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = item['heading']\n",
    "        # format() rather than '+' so a numeric id does not raise TypeError.\n",
    "        template['url'] = 'http://nla.gov.au/nla.news-article{}'.format(item['id'])\n",
    "        template['publicationTitle'] = item['title']['value']\n",
    "        template['pages'] = item['page']\n",
    "        template['date'] = item['date']\n",
    "        try:\n",
    "            pdf_url = get_newspaper_pdf(item['id'])\n",
    "        except requests.RequestException as err:\n",
    "            print('PDF lookup failed for article {}: {}'.format(item['id'], err))\n",
    "            pdf_url = None\n",
    "        if pdf_url:\n",
    "            downloads.append((pdf_url, 'article-{}.pdf'.format(item['id'])))\n",
    "\n",
    "    elif item_type == 'externalWebsite':\n",
    "        zotero_type = 'webpage'\n",
    "        template = zotero_api.item_template(zotero_type)\n",
    "        template['title'] = item['title']\n",
    "        template['url'] = item['identifier']['value']\n",
    "\n",
    "    else:\n",
    "        # Without this branch an unknown type fails below with an unbound 'template'.\n",
    "        raise ValueError('Unsupported Trove record type: {}'.format(item_type))\n",
    "\n",
    "    for url, default in downloads:\n",
    "        local_path = _download_attachment(url, default)\n",
    "        if local_path:\n",
    "            attachments.append(local_path)\n",
    "\n",
    "    if template['itemType'] != 'webpage':\n",
    "        template['libraryCatalog'] = 'Trove'\n",
    "    template['accessDate'] = datetime.datetime.now().date().isoformat()\n",
    "    return {'zotero_item': template, 'attachments': attachments}\n",
    "\n",
    "def create_zotero_collection(zotero_api, collection_name):\n",
    "    '''\n",
    "    Create a Zotero collection with the given name, then look up its key.\n",
    "\n",
    "    Returns the key of the first collection returned by a search for the name.\n",
    "    Returns None when no name was given, when creation failed, or when the key\n",
    "    could not be retrieved (a message is printed in the last two cases).\n",
    "    '''\n",
    "    if not collection_name:\n",
    "        return None\n",
    "    if not zotero_api.create_collections([{'name': collection_name}]):\n",
    "        print('Error creating collection.')\n",
    "        return None\n",
    "    matches = zotero_api.collections(q=collection_name)\n",
    "    try:\n",
    "        return matches[0]['key']\n",
    "    except (IndexError, KeyError):\n",
    "        print('Error retrieving collection key.')\n",
    "        return None\n",
    "\n",
    "def check_duplicate_collection(zotero_api, collection_name):\n",
    "    '''\n",
    "    Search the library for an existing collection with the supplied name.\n",
    "\n",
    "    Returns the key of the first search result, or None if nothing was found.\n",
    "    '''\n",
    "    matches = zotero_api.collections(q=collection_name)\n",
    "    return matches[0]['key'] if matches else None\n",
    "\n",
    "def export_list(list_id, zotero_api, trove_api):\n",
    "    '''\n",
    "    Copy the items of a Trove list into a Zotero collection.\n",
    "\n",
    "    The collection is named '[list title] (Trove list: [list id])'. An existing\n",
    "    collection found by that name is reused, otherwise a new one is created.\n",
    "    Items marked as deleted in the list are skipped. Progress is printed as\n",
    "    each item is processed.\n",
    "\n",
    "    Parameters:\n",
    "        list_id: id of the Trove list to export.\n",
    "        zotero_api: Zotero client used to create the collection, items and attachments.\n",
    "        trove_api: Trove client used to fetch the list and its records.\n",
    "    '''\n",
    "    trove_list = trove_api.get_item(item_id=list_id, item_type='list')\n",
    "    list_name = '{} (Trove list: {})'.format(trove_list.get_title(), list_id)\n",
    "    print(list_name)\n",
    "    collection_key = check_duplicate_collection(zotero_api, list_name)\n",
    "    if not collection_key:\n",
    "        collection_key = create_zotero_collection(zotero_api, list_name)\n",
    "    if not collection_key:\n",
    "        # The helpers above have already printed why no collection is available.\n",
    "        return\n",
    "    for item in trove_list.list_items:\n",
    "        if 'deleted' in item:\n",
    "            continue\n",
    "        details = create_zotero_object(zotero_api, trove_api, item)\n",
    "        zotero_item = details['zotero_item']\n",
    "        zotero_item['collections'] = [collection_key]\n",
    "        response = zotero_api.create_items([zotero_item])\n",
    "        # Only one item is sent per request, so a successful write is reported\n",
    "        # under index '0'. Check it before reporting success or attaching files.\n",
    "        successful = response.get('successful') or {}\n",
    "        created = successful.get('0') if isinstance(successful, dict) else None\n",
    "        if not created:\n",
    "            print('Error adding item: {} ({})'.format(zotero_item['title'], response.get('failed')))\n",
    "            continue\n",
    "        if details['attachments']:\n",
    "            zotero_api.attachment_simple(details['attachments'], created['key'])\n",
    "        print('New item added: {}'.format(zotero_item['title']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add your details here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Id of the Zotero library to export into. This is a temporary group id created for\n",
    "# demo purposes, replace it with your own personal or group library id.\n",
    "zotero_library_id = '2315662'\n",
    "# Kind of library the id above refers to.\n",
    "zotero_library_type = 'group' # either 'user' or 'group'\n",
    "# Zotero API key. This is a temporary key created for demonstration purposes; replace it\n",
    "# with your own, and avoid saving a real key in a notebook that you share or commit.\n",
    "zotero_library_key = 'zUjtvoJwuUNTwxKGIbj6t8wt'\n",
    "\n",
    "# Zotero client that is passed to export_list() when the export is started.\n",
    "zot_api = zotero.Zotero(zotero_library_id, zotero_library_type, zotero_library_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace this with the id of the list you want to export\n",
    "trove_list_id = '83777'\n",
    "# Trove API key. This is a temporary key created for demonstration purposes; replace it\n",
    "# with your own, and avoid saving a real key in a notebook that you share or commit.\n",
    "trove_api_key = 'ju3rgk0jp354ikmh'\n",
    "\n",
    "# Trove client that is passed to export_list() when the export is started.\n",
    "trove_api = Trove(trove_api_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start the export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the configured Trove list into the configured Zotero library,\n",
    "# printing the collection name and then a line for each item processed.\n",
    "export_list(trove_list_id, zot_api, trove_api)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}