# # ElementTree # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $ # # limited xpath support for element trees # # history: # 2003-05-23 fl created # 2003-05-28 fl added support for // etc # 2003-08-27 fl fixed parsing of periods in element names # 2007-09-10 fl new selection engine # 2007-09-12 fl fixed parent selector # 2007-09-13 fl added iterfind; changed findall to return a list # 2007-11-30 fl added namespaces support # 2009-10-30 fl added child element value filter # # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved. # # fredrik@pythonware.com # http://www.pythonware.com # # -------------------------------------------------------------------- # The ElementTree toolkit is # # Copyright (c) 1999-2009 by Fredrik Lundh # # By obtaining, using, and/or copying this software and/or its # associated documentation, you agree that you have read, understood, # and will comply with the following terms and conditions: # # Permission to use, copy, modify, and distribute this software and # its associated documentation for any purpose and without fee is # hereby granted, provided that the above copyright notice appears in # all copies, and that both that copyright notice and this permission # notice appear in supporting documentation, and that the name of # Secret Labs AB or the author not be used in advertising or publicity # pertaining to distribution of the software without specific, written # prior permission. # # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE # OF THIS SOFTWARE. # -------------------------------------------------------------------- # Licensed to PSF under a Contributor Agreement. # See http://www.python.org/psf/license for licensing details. ## # Implementation module for XPath support. There's usually no reason # to import this module directly; the ElementTree does this for # you, if needed. ## import re xpath_tokenizer_re = re.compile( "(" "'[^']*'|\"[^\"]*\"|" "::|" "//?|" "\.\.|" "|" "[/.*:\[\]@=])|" "((?:\{[^}]+\})?[^/\[\]@=\s]+)|" "\s+" ) def xpath_tokenizer(pattern, namespaces=None): for token in xpath_tokenizer_re.findall(pattern): tag = token[1] if tag and tag[0] != "{" and ":" in tag: try: prefix, uri = tag.split(":", 1) if not namespaces: raise KeyError yield token[0], "{%s}%s" % (namespaces[prefix], uri) except KeyError: raise SyntaxError("prefix %r not found in prefix map" % prefix) else: yield token def get_parent_map(context): parent_map = context.parent_map if parent_map is None: context.parent_map = parent_map = {} for p in context.root.iter(): for e in p: parent_map[e] = p return parent_map def prepare_child(next, token): tag = token[1] def select(context, result): for elem in result: for e in elem: if e.tag == tag: yield e return select def prepare_star(next, token): def select(context, result): for elem in result: for e in elem: yield e return select def prepare_self(next, token): def select(context, result): for elem in result: yield elem return select def prepare_descendant(next, token): token = next() if token[0] == "*": tag = "*" elif not token[0]: tag = token[1] else: raise SyntaxError("invalid descendant") def select(context, result): for elem in result: for e in elem.iter(tag): if e is not elem: yield e return select def prepare_parent(next, token): def select(context, result): # FIXME: raise error if .. is applied at toplevel? parent_map = get_parent_map(context) result_map = {} for elem in result: if elem in parent_map: parent = parent_map[elem] if parent not in result_map: result_map[parent] = None yield parent return select def prepare_predicate(next, token): # FIXME: replace with real parser!!! refs: # http://effbot.org/zone/simple-iterator-parser.htm # http://javascript.crockford.com/tdop/tdop.html signature = [] predicate = [] while 1: token = next() if token[0] == "]": break if token[0] and token[0][:1] in "'\"": token = "'", token[0][1:-1] signature.append(token[0] or "-") predicate.append(token[1]) signature = "".join(signature) # use signature to determine predicate type if signature == "@-": # [@attribute] predicate key = predicate[1] def select(context, result): for elem in result: if elem.get(key) is not None: yield elem return select if signature == "@-='": # [@attribute='value'] key = predicate[1] value = predicate[-1] def select(context, result): for elem in result: if elem.get(key) == value: yield elem return select if signature == "-" and not re.match("\d+$", predicate[0]): # [tag] tag = predicate[0] def select(context, result): for elem in result: if elem.find(tag) is not None: yield elem return select if signature == "-='" and not re.match("\d+$", predicate[0]): # [tag='value'] tag = predicate[0] value = predicate[-1] def select(context, result): for elem in result: for e in elem.findall(tag): if "".join(e.itertext()) == value: yield elem break return select if signature == "-" or signature == "-()" or signature == "-()-": # [index] or [last()] or [last()-index] if signature == "-": index = int(predicate[0]) - 1 else: if predicate[0] != "last": raise SyntaxError("unsupported function") if signature == "-()-": try: index = int(predicate[2]) - 1 except ValueError: raise SyntaxError("unsupported expression") else: index = -1 def select(context, result): parent_map = get_parent_map(context) for elem in result: try: parent = parent_map[elem] # FIXME: what if the selector is "*" ? elems = list(parent.findall(elem.tag)) if elems[index] is elem: yield elem except (IndexError, KeyError): pass return select raise SyntaxError("invalid predicate") ops = { "": prepare_child, "*": prepare_star, ".": prepare_self, "..": prepare_parent, "//": prepare_descendant, "[": prepare_predicate, } _cache = {} class _SelectorContext: parent_map = None def __init__(self, root): self.root = root # -------------------------------------------------------------------- ## # Generate all matching objects. def iterfind(elem, path, namespaces=None): # compile selector pattern if path[-1:] == "/": path = path + "*" # implicit all (FIXME: keep this?) try: selector = _cache[path] except KeyError: if len(_cache) > 100: _cache.clear() if path[:1] == "/": raise SyntaxError("cannot use absolute path on element") next = iter(xpath_tokenizer(path, namespaces)).next token = next() selector = [] while 1: try: selector.append(ops[token[0]](next, token)) except StopIteration: raise SyntaxError("invalid path") try: token = next() if token[0] == "/": token = next() except StopIteration: break _cache[path] = selector # execute selector pattern result = [elem] context = _SelectorContext(elem) for select in selector: result = select(context, result) return result ## # Find first matching object. def find(elem, path, namespaces=None): try: return iterfind(elem, path, namespaces).next() except StopIteration: return None ## # Find all matching objects. def findall(elem, path, namespaces=None): return list(iterfind(elem, path, namespaces)) ## # Find text for first matching object. def findtext(elem, path, default=None, namespaces=None): try: elem = iterfind(elem, path, namespaces).next() return elem.text or "" except StopIteration: return default