#!/usr/bin/python3

"""
pdfmunge - Process PDFs to make them more legible on eBook readers.
Copyright Felix Crux (www.felixcrux.com)
Available under the MIT License (see Readme).
"""

USAGE_STRING = """
Usage: pdfmunge [options]... input_file output_file
Options:
  -r --rotate     Slice pages in half and rotate each half 90 degrees
                  counter-clockwise, creating a pseudo-landscape mode on
                  devices that don't do this automatically. Be warned that
                  this will double the size of your output file.
  -m --margin     If using rotation/slicing, have each page overlap with the
                  previous one by this amount (helps with lines getting cut
                  off in the middle).
  -b --bounds     Boundaries of visible area on each PDF page. Useful for
                  cropping off large margins. If this is given, cropping is
                  done automatically; otherwise it is not done. Boundaries
                  should be given as four comma-separated numbers, all
                  enclosed in quotation marks, like so: "10,20,100,120".
                  Any whitespace inside the quotation marks is ignored.
  -o --oddbounds  For PDFs that have different margins on even and odd pages,
                  use these boundary values for odd numbered pages, with
                  --bounds applying to even numbered ones. If this is given,
                  --bounds is required.
  -e --exclude    Numbers or ranges of pages to not include in the output
                  PDF. These should be given as a series of numbers or ranges
                  surrounded by quotation marks, and separated by commas. Any
                  whitespace is ignored. Ranges are given as two numbers
                  separated by a hyphen/minus sign (-), where the first number
                  must be smaller than the second. Example: "1,2,4-8,40".
                  This option takes precedence over --intact.
  -i --intact     Leave these pages completely unchanged, ignoring cropping,
                  rotating, or anything else. Requires a set of numbers or
                  ranges like --exclude. Excluded pages are ignored even if
                  listed here.
"""

import getopt
import sys

import PyPDF2


def main(argv):
    """ Process PDFs to make them more legible on eBook readers.

    argv is the list of command-line arguments, without the program name.

    Returns the process exit status: 0 on success, 1 if the input file could
    not be opened or the output file could not be created, and 2 if the
    command-line arguments were invalid (the usage text is printed then).
    """
    try:
        options = handle_options(argv)
    except getopt.GetoptError as err:
        print(str(err))
        print(USAGE_STRING)
        return 2

    # Get our inputs and outputs sorted out and opened. The `with` blocks
    # guarantee both files are closed on every exit path, including errors.
    try:
        input_stream = open(options["infile"], "rb")
    except IOError as err:
        print("Unable to open input file: %s" % str(err))
        return 1

    with input_stream:
        try:
            in_stream = PyPDF2.PdfFileReader(input_stream)
            # When slicing, a second reader over the same stream supplies a
            # second page object per page, so each half gets its own mediaBox.
            in_stream2 = None
            if options["rotate"]:
                in_stream2 = PyPDF2.PdfFileReader(input_stream)
        except IOError as err:
            print("Unable to open input file: %s" % str(err))
            return 1

        try:
            output_stream = open(options["outfile"], "wb")
        except IOError as err:
            print("Unable to create output file: %s" % str(err))
            return 1

        with output_stream:
            output = PyPDF2.PdfFileWriter()
            # The meat of the program: go over every page performing the
            # user's bidding.
            _munge_pages(in_stream, in_stream2, output, options)
            # All right, we're done. Write the output, close up, go home.
            output.write(output_stream)

    return 0


def _munge_pages(in_stream, in_stream2, output, options):
    """ Copy every non-excluded page from the reader(s) into the writer,
    cropping and slicing/rotating each one unless it is listed as intact.

    in_stream2 is only used when options["rotate"] is set; it may be None
    otherwise.
    """
    # Build the lookup sets once rather than scanning a list for every page.
    excluded = set(options["exclude"])
    intact = set(options["intact"])

    for page_num in range(in_stream.getNumPages()):
        if page_num in excluded:
            continue

        page = in_stream.getPage(page_num)
        if page_num in intact:
            output.addPage(page)
            continue

        page2 = in_stream2.getPage(page_num) if options["rotate"] else None
        if "bounds" in options:
            crop(page, page_num, options)
            crop(page2, page_num, options)
        if options["rotate"]:
            rotate(page, page2, options)
        output.addPage(page)
        if page2 is not None:
            output.addPage(page2)


def crop(page, page_num, options):
    """ Apply user-specified bounds to the page.

    page may be None (there is no second half when not rotating), in which
    case nothing is done. page_num is the 0-indexed page number, and options
    must contain "bounds" (and optionally "oddbounds").
    """
    if page is None:
        return
    # Note that (page_num % 2 == 0) is the correct test for odd numbered
    # pages, since we are using 0-indexed ones, where the user expects
    # 1-indexed.
    if "oddbounds" in options and page_num % 2 == 0:
        bounds = options["oddbounds"]
    else:
        bounds = options["bounds"]
    page.mediaBox = PyPDF2.generic.RectangleObject(
        [PyPDF2.generic.NumberObject(x) for x in bounds])


def rotate(page, page2, options):
    """ Perform slicing and rotation on pages.

    page and page2 are two page objects for the same source page. page keeps
    the part above the vertical midpoint, page2 the part below it, with each
    extended past the midpoint by options["margin"] so the halves overlap.
    Both are then rotated 90 degrees counter-clockwise.
    """
    # Each list is [lower-left x, lower-left y, upper-right x, upper-right y].
    bounds = list(page.mediaBox.lowerLeft) + list(page.mediaBox.upperRight)
    bounds2 = list(page2.mediaBox.lowerLeft) + list(page2.mediaBox.upperRight)

    # Raise the bottom edge of the first half to just below the midpoint...
    bounds[1] = (bounds[3] - bounds[1]) / 2 + bounds[1] - options["margin"]
    # ...and lower the top edge of the second half to just above it.
    bounds2[3] = (bounds2[3] - bounds2[1]) / 2 + bounds2[1] + options["margin"]

    # NOTE(review): NumberObject looks integer-valued, so a fractional
    # midpoint is presumably truncated here -- confirm against PyPDF2.
    page.mediaBox = PyPDF2.generic.RectangleObject(
        [PyPDF2.generic.NumberObject(x) for x in bounds])
    page2.mediaBox = PyPDF2.generic.RectangleObject(
        [PyPDF2.generic.NumberObject(x) for x in bounds2])

    page.rotateCounterClockwise(90)
    page2.rotateCounterClockwise(90)


def handle_options(argv):
    """ Parse the command-line arguments and populate the options dictionary.

    All options are optional (as the name tends to suggest), but two
    arguments are required: an input filename and an output filename.

    Returns a dictionary that always has the keys "rotate", "exclude",
    "intact", "margin", "infile" and "outfile", plus "bounds" and "oddbounds"
    when those options were given.

    Raises getopt.GetoptError for unknown options, malformed option values,
    missing filenames, or --oddbounds given without --bounds.
    """
    options = {"rotate": False, "exclude": [], "intact": [], "margin": 0}
    opts, args = getopt.getopt(argv, "rb:o:e:i:m:",
                               ["rotate", "bounds=", "oddbounds=", "exclude=",
                                "intact=", "margin="])

    for opt, arg in opts:
        try:
            if opt in ("-r", "--rotate"):
                options["rotate"] = True
            elif opt in ("-b", "--bounds"):
                options["bounds"] = parse_bounds(arg)
            elif opt in ("-o", "--oddbounds"):
                options["oddbounds"] = parse_bounds(arg)
            elif opt in ("-e", "--exclude"):
                options["exclude"] = parse_range(arg)
            elif opt in ("-i", "--intact"):
                options["intact"] = parse_range(arg)
            elif opt in ("-m", "--margin"):
                options["margin"] = int(arg)
            else:
                # An explicit raise rather than an assert, which would be
                # stripped when running with -O.
                raise getopt.GetoptError("Unhandled option: %s" % opt)
        except ValueError as err:
            # Report malformed numbers as a usage error, not a traceback.
            raise getopt.GetoptError(
                "Invalid value for %s: %r (%s)" % (opt, arg, err)) from err

    if len(args) < 2:
        raise getopt.GetoptError("Missing input or output filename.")
    options["infile"], options["outfile"] = args[0], args[1]

    if "oddbounds" in options and "bounds" not in options:
        raise getopt.GetoptError("Boundaries for even pages required if odd "
                                 "page boundaries given.")

    return options


def parse_bounds(bounds_string):
    """ Given a string representation of four boundary values, return a
    four-item list representing those numbers.

    Input values should be separated by commas, with whitespace being
    ignored. Raises ValueError if a value is not an integer or if there are
    not exactly four values.
    """
    bounds = [int(val) for val in bounds_string.split(",")]
    if len(bounds) != 4:
        raise ValueError("expected four comma-separated numbers, got %d"
                         % len(bounds))
    return bounds


def parse_range(range_string):
    """ Return a list of numbers representing the input ranges.

    Inputs can be individual numbers, or ranges, given by two numbers
    separated by a hyphen/minus sign (-), with each input separated by a
    comma. All whitespace is ignored, as are blank entries (such as the one
    left by a trailing comma). In range-type inputs, the second number must
    be larger than the first. Ranges are inclusive of both numbers.

    Because these numbers represent page numbers, which humans index from 1,
    but PyPDF2 indexes from 0, the *inputs* are 1-indexed, but the *outputs*
    are 0-indexed.

    Raises ValueError if an entry is not a number or a two-number range.
    """
    expanded_list = []
    for cur_range in range_string.split(","):
        if not cur_range.strip():
            continue
        if "-" in cur_range:
            start, end = cur_range.split("-")
            # 1-indexed inclusive [start, end] is 0-indexed half-open
            # [start - 1, end).
            expanded_list.extend(range(int(start) - 1, int(end)))
        else:
            expanded_list.append(int(cur_range) - 1)
    return expanded_list


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))