#!/usr/bin/env python # coding=utf-8 # The MIT License (MIT) # # Copyright (c) 2013-2017 Alberto Pettarin (alberto@albertopettarin.it) # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """ Export annotations and highlights from a Kobo SQLite file. """ from __future__ import absolute_import from __future__ import print_function import argparse import datetime import csv import io import os import sqlite3 import sys __author__ = "Alberto Pettarin" __email__ = "alberto@albertopettarin.it" __copyright__ = "Copyright 2013-2017, Alberto Pettarin (www.albertopettarin.it)" __license__ = "MIT" __status__ = "Production" __version__ = "2.1.1" PY2 = (sys.version_info[0] == 2) DAYS = [ u"Monday", u"Tuesday", u"Wednesday", u"Thursday", u"Friday", u"Saturday", u"Sunday", ] MONTHS = [ u"January", u"February", u"March", u"April", u"May", u"June", u"July", u"August", u"September", u"October", u"November", u"December", ] class CommandLineTool(object): """ A class providing a generic command line tool, with the associated functions, error reporting, etc. It is based on ``argparse``. """ # overload in the actual subclass # AP_PROGRAM = sys.argv[0] AP_DESCRIPTION = u"Generic Command Line Tool" AP_ARGUMENTS = [ # required args # {"name": "foo", "nargs": 1, "type": str, "default": "baz", "help": "Foo help"}, # # optional args # {"name": "--bar", "nargs": "?", "type": str,, "default": "foofoofoo", "help": "Bar help"}, # {"name": "--quiet", "action": "store_true", "help": "Do not output to stdout"}, ] def __init__(self): self.parser = argparse.ArgumentParser( prog=self.AP_PROGRAM, description=self.AP_DESCRIPTION ) self.vargs = None for arg in self.AP_ARGUMENTS: if "action" in arg: self.parser.add_argument( arg["name"], action=arg["action"], help=arg["help"] ) else: self.parser.add_argument( arg["name"], nargs=arg["nargs"], type=arg["type"], default=arg["default"], help=arg["help"] ) def run(self): """ Run the command line tool. """ self.vargs = vars(self.parser.parse_args()) self.actual_command() sys.exit(0) def actual_command(self): """ The actual command to be run. This function is meant to be overridden in an actual subclass. """ self.print_stdout(u"This script does nothing. Invoke another .py") def error(self, message): """ Print an error and exit with exit code 1. """ self.print_stderr(u"ERROR: %s" % message) sys.exit(1) def print_stdout(self, *args, **kwargs): """ Print to standard out. """ print(*args, **kwargs) def print_stderr(self, *args, **kwargs): """ Print to standard error. """ print(*args, file=sys.stderr, **kwargs) class Item(object): """ A class representing one of: annotation, bookmark, or highlight. It is basically a named tuple, with some extra functions to format the contents. """ ANNOTATION = "annotation" BOOKMARK = "bookmark" HIGHLIGHT = "highlight" def __init__(self, values): self.volumeid = values[0] self.text = values[1] self.annotation = values[2] self.extraannotationdata = values[3] self.datecreated = values[4] if values[4] is not None else u"1970-01-01T00:00:00.000" self.datemodified = values[5] if values[5] is not None else u"1970-01-01T00:00:00.000" self.booktitle = values[6] self.title = values[7] self.author = values[8] self.kind = self.BOOKMARK if (self.text is not None) and (self.text != "") and (self.annotation is not None) and (self.annotation != ""): self.kind = self.ANNOTATION elif (self.text is not None) and (self.text != ""): self.kind = self.HIGHLIGHT def csv_tuple(self): """ Return a tuple representing this Item, for CSV-output purposes. """ return (self.kind, self.title, self.author, self.datecreated, self.datemodified, self.annotation, self.text) def kindle_my_clippings(self): """ Return a string representing this Item, in the Kindle "My Clippings" format. """ def kindle_date(date_string): d = u"Thursday, 1 January 1970 00:00:00" try: p1, p2 = date_string.split("T") year, month, day = [int(x) for x in p1.split("-")] hour, minute, second = [int(float(x)) for x in p2.split(":")] sday = DAYS[datetime.datetime(year=year, month=month, day=day).weekday()] smonth = MONTHS[month - 1] # e.g. u"Friday, 19 December 2014 19:54:11" d = u"%s, %d %s %d %02d:%02d:%02d" % (sday, day, smonth, year, hour, minute, second) except: pass return d date = kindle_date(self.datecreated) acc = [] acc.append(u"%s (%s)" % (self.title, self.author)) if self.kind == self.ANNOTATION: acc.append(u"- Your Note on page %d | location %d | Added on %s" % (1, 1, date)) acc.append(u"") acc.append(self.annotation) elif self.kind == self.HIGHLIGHT: acc.append(u"- Your Highlight on page %d | location %d | Added on %s" % (1, 1, date)) acc.append(u"") acc.append(self.text) else: acc.append(u"- Your Bookmark on page %d | location %d | Added on %s" % (1, 1, date)) acc.append(u"") acc.append(u"==========") return u"\n".join(acc) def __repr__(self): return u"(%s, %s, %s, %s, %s, %s, %s)" % self.csv_tuple() def __str__(self): acc = [] sep = u"\n=== === ===\n" if self.kind == self.ANNOTATION: acc.append(u"Type: %s" % (self.kind)) acc.append(u"Title: %s" % (self.title)) acc.append(u"Author: %s" % (self.author)) acc.append(u"Date created: %s" % (self.datecreated)) acc.append(u"Annotation:%s%s%s" % (sep, self.annotation, sep)) acc.append(u"Reference text:%s%s%s" % (sep, self.text, sep)) if self.kind == self.HIGHLIGHT: acc.append(u"Type: %s" % (self.kind)) acc.append(u"Title: %s" % (self.title)) acc.append(u"Author: %s" % (self.author)) acc.append(u"Date created: %s" % (self.datecreated)) acc.append(u"Reference text:%s%s%s" % (sep, self.text, sep)) return u"\n".join(acc) class Book(object): """ A class representing a book. It is basically a named tuple, with some extra functions to format the contents. """ def __init__(self, values): self.volumeid = values[0] self.booktitle = values[1] self.title = values[2] self.author = values[3] def __repr__(self): return u"(%s, %s, %s, %s)" % (self.volumeid, self.booktitle, self.title, self.author) def __str__(self): return self.__repr__() class ExportKobo(CommandLineTool): """ The actual command line tool to export annotations, bookmarks, and highlights from a Kobo SQLite file. """ AP_PROGRAM = u"export-kobo" AP_DESCRIPTION = u"Export annotations and highlights from a Kobo SQLite file." AP_ARGUMENTS = [ { "name": "db", "nargs": None, "type": str, "default": None, "help": "Path of the input KoboReader.sqlite file" }, { "name": "--output", "nargs": "?", "type": str, "default": None, "help": "Output to file instead of using the standard output" }, { "name": "--csv", "action": "store_true", "help": "Output in CSV format instead of human-readable format" }, { "name": "--kindle", "action": "store_true", "help": "Output in Kindle 'My Clippings.txt' format instead of human-readable format" }, { "name": "--list", "action": "store_true", "help": "List the titles of books with annotations or highlights" }, { "name": "--book", "nargs": "?", "type": str, "default": None, "help": "Output annotations and highlights only from the book with the given title" }, { "name": "--bookid", "nargs": "?", "type": str, "default": None, "help": "Output annotations and highlights only from the book with the given ID" }, { "name": "--annotations-only", "action": "store_true", "help": "Outputs annotations only, excluding highlights" }, { "name": "--highlights-only", "action": "store_true", "help": "Outputs highlights only, excluding annotations" }, { "name": "--info", "action": "store_true", "help": "Print information about the number of annotations and highlights" }, { "name": "--raw", "action": "store_true", "help": "Output in raw text instead of human-readable format" }, ] # NOTE: not a tuple, just a continuation string! QUERY_ITEMS = ( "SELECT " "Bookmark.VolumeID, " "Bookmark.Text, " "Bookmark.Annotation, " "Bookmark.ExtraAnnotationData, " "Bookmark.DateCreated, " "Bookmark.DateModified, " "content.BookTitle, " "content.Title, " "content.Attribution " "FROM Bookmark INNER JOIN content " "ON Bookmark.VolumeID = content.ContentID;" ) # NOTE: not a tuple, just a continuation string! QUERY_BOOKS = ( "SELECT DISTINCT " "Bookmark.VolumeID, " "content.BookTitle, " "content.Title, " "content.Attribution " "FROM Bookmark INNER JOIN content " "ON Bookmark.VolumeID = content.ContentID " "ORDER BY content.Title;" ) def __init__(self): super(ExportKobo, self).__init__() self.items = [] def actual_command(self): """ The main function of the tool: parse the parameters, read the given SQLite file, and format/output data as requested. """ if self.vargs["db"] is None: self.error(u"You must specify the path to your KoboReader.sqlite file.") books = self.enumerate_books() if self.vargs["list"]: # export list of books acc = [] acc.append((u"ID", u"TITLE", u"AUTHOR")) for (i, b) in books: acc.append((i, b.title, b.author)) if self.vargs["csv"]: acc = self.list_to_csv(acc) else: acc = u"\n".join([(u"%s\t%s\t%s" % (i, t, a)) for (i, t, a) in acc]) else: # export annotations and/or highlights items = self.read_items() if self.vargs["kindle"]: # kindle format acc = u"\n".join([i.kindle_my_clippings() for i in items]) elif self.vargs["csv"]: # CSV format acc = self.list_to_csv([i.csv_tuple() for i in items]) elif self.vargs["raw"]: acc = u"\n".join([(u"%s\n" % i.text) for i in items]) else: # human-readable format acc = u"\n".join([(u"%s\n" % i) for i in items]) if self.vargs["output"] is not None: # write to file try: with io.open(self.vargs["output"], "w", encoding="utf-8") as f: f.write(acc) except IOError: self.error(u"Unable to write output file. Please check that the path is correct and that you have write permission on it.") else: # write to stdout try: self.print_stdout(acc) except UnicodeEncodeError: self.print_stdout(acc.encode("ascii", errors="replace")) if self.vargs["info"]: # print some info about the extraction self.print_stdout(u"") self.print_stdout(u"Books with annotations or highlights: %d" % len(books)) if not self.vargs["list"]: self.print_stdout(u"Annotations and/or highlights: %d" % len(items)) def list_to_csv(self, data): """ Convert the given Item data into a well-formed CSV string. """ if PY2: # PY2 output = io.BytesIO() else: # PY3 output = io.StringIO() writer = csv.writer(output) for d in data: try: writer.writerow(d) except UnicodeEncodeError: writer.writerow(tuple([(v.encode("ascii", errors="replace") if v is not None else "") for v in d])) if PY2: # PY2 return output.getvalue().decode("utf-8") else: # PY3 return output.getvalue() def enumerate_books(self): """ Return a list of pairs ``(int, Book)``, with the index starting at one. """ books = [Book(d) for d in self.query(self.QUERY_BOOKS)] return list(enumerate(books, start=1)) def volumeid_from_bookid(self): """ Get the correct ``volumeid`` from the ``bookid``, that is, the index of the book as produced by the ``enumerate_books()``. """ enum = self.enumerate_books() bookid = self.vargs["bookid"] try: return enum[int(bookid) - 1][1].volumeid except: self.error(u"The bookid value must be an integer between 1 and %d" % (len(enum))) def read_items(self): """ Query the SQLite file, filtering Item objects as specified by the user. """ items = [Item(d) for d in self.query(self.QUERY_ITEMS)] if len(items) == 0: return items if (self.vargs["bookid"] is not None) and (self.vargs["book"] is not None): self.error(u"You cannot specify both --book and --bookid.") if self.vargs["bookid"] is not None: items = [i for i in items if i.volumeid == self.volumeid_from_bookid()] if self.vargs["book"] is not None: items = [i for i in items if i.title == self.vargs["book"]] if self.vargs["highlights_only"]: items = [i for i in items if i.kind == Item.HIGHLIGHT] if self.vargs["annotations_only"]: items = [i for i in items if i.kind == Item.ANNOTATION] return items def query(self, query): """ Run the given query over the SQLite file. """ db_path = self.vargs["db"] if not os.path.exists(db_path): self.error(u"Unable to read the KoboReader.sqlite file. Please check that the path is correct and that you have read permission on it.") try: sql_connection = sqlite3.connect(db_path) sql_cursor = sql_connection.cursor() sql_cursor.execute(query) data = sql_cursor.fetchall() sql_cursor.close() sql_connection.close() except Exception as exc: self.error(u"Unexpected error reading your KoboReader.sqlite file: %s" % (exc)) # NOTE the values are Unicode strings (unicode on PY2, str on PY3) # hence data is a list of tuples of Unicode strings return data def main(): ExportKobo().run() if __name__ == "__main__": main()