""" Generate an excerpt from either the summary or a content of an entry. Parameters: width: maximum number of characters in the excerpt. Default: 500 omit: whitespace delimited list of html tags to remove. Default: none target: name of element created. Default: planet:excerpt Notes: * if 'img' is in the list of tags to be omitted tags are replaced with hypertext links associated with the value of the 'alt' attribute. If there is no alt attribute value, is used instead. If the parent element of the img tag is already an tag, no additional hypertext links are added. """ import sys, xml.dom.minidom, textwrap from xml.dom import Node, minidom atomNS = 'http://www.w3.org/2005/Atom' planetNS = 'http://planet.intertwingly.net/' args = dict(zip([name.lstrip('-') for name in sys.argv[1::2]], sys.argv[2::2])) wrapper = textwrap.TextWrapper(width=int(args.get('width','500'))) omit = args.get('omit', '').split() target = args.get('target', 'planet:excerpt') class copy: """ recursively copy a source to a target, up to a given width """ def __init__(self, dom, source, target): self.dom = dom self.full = False self.text = [] self.textlen = 0 self.copyChildren(source, target) def copyChildren(self, source, target): """ copy child nodes of a source to the target """ for child in source.childNodes: if child.nodeType == Node.ELEMENT_NODE: self.copyElement(child, target) elif child.nodeType == Node.TEXT_NODE: self.copyText(child.data, target) if self.full: break def copyElement(self, source, target): """ copy source element to the target """ # check the omit list if source.nodeName in omit: if source.nodeName == 'img': return self.elideImage(source, target) return self.copyChildren(source, target) # copy element, attributes, and children child = self.dom.createElementNS(source.namespaceURI, source.nodeName) target.appendChild(child) for i in range(0, source.attributes.length): attr = source.attributes.item(i) child.setAttributeNS(attr.namespaceURI, attr.name, attr.value) self.copyChildren(source, child) def elideImage(self, source, target): """ copy an elided form of the image element to the target """ alt = source.getAttribute('alt') or '' src = source.getAttribute('src') if target.nodeName == 'a' or not src: self.copyText(alt, target) else: child = self.dom.createElement('a') child.setAttribute('href', src) self.copyText(alt, child) target.appendChild(child) def copyText(self, source, target): """ copy text to the target, until the point where it would wrap """ if not source.isspace() and source.strip(): self.text.append(source.strip()) lines = wrapper.wrap(' '.join(self.text)) if len(lines) == 1: target.appendChild(self.dom.createTextNode(source)) self.textlen = len(lines[0]) elif lines: excerpt = source[:len(lines[0])-self.textlen] + u' \u2026' target.appendChild(dom.createTextNode(excerpt)) self.full = True # select summary or content element dom = minidom.parse(sys.stdin) source = dom.getElementsByTagNameNS(atomNS, 'summary') if not source: source = dom.getElementsByTagNameNS(atomNS, 'content') # if present, recursively copy it to a planet:excerpt element if source: if target.startswith('planet:'): dom.documentElement.setAttribute('xmlns:planet', planetNS) if target.startswith('atom:'): target = target.split(':',1)[1] excerpt = dom.createElementNS(planetNS, target) source[0].parentNode.appendChild(excerpt) copy(dom, source[0], excerpt) if source[0].nodeName == excerpt.nodeName: source[0].parentNode.removeChild(source[0]) # print out results print dom.toxml('utf-8')