|
|
- #
- # ElementTree
- # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
- #
- # limited xpath support for element trees
- #
- # history:
- # 2003-05-23 fl created
- # 2003-05-28 fl added support for // etc
- # 2003-08-27 fl fixed parsing of periods in element names
- # 2007-09-10 fl new selection engine
- # 2007-09-12 fl fixed parent selector
- # 2007-09-13 fl added iterfind; changed findall to return a list
- # 2007-11-30 fl added namespaces support
- # 2009-10-30 fl added child element value filter
- #
- # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
- #
- # fredrik@pythonware.com
- # http://www.pythonware.com
- #
- # --------------------------------------------------------------------
- # The ElementTree toolkit is
- #
- # Copyright (c) 1999-2009 by Fredrik Lundh
- #
- # By obtaining, using, and/or copying this software and/or its
- # associated documentation, you agree that you have read, understood,
- # and will comply with the following terms and conditions:
- #
- # Permission to use, copy, modify, and distribute this software and
- # its associated documentation for any purpose and without fee is
- # hereby granted, provided that the above copyright notice appears in
- # all copies, and that both that copyright notice and this permission
- # notice appear in supporting documentation, and that the name of
- # Secret Labs AB or the author not be used in advertising or publicity
- # pertaining to distribution of the software without specific, written
- # prior permission.
- #
- # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
- # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
- # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
- # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
- # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- # OF THIS SOFTWARE.
- # --------------------------------------------------------------------
-
- ##
- # Implementation module for XPath support. There's usually no reason
- # to import this module directly; the <b>ElementTree</b> does this for
- # you, if needed.
- ##
-
- import re
-
# Tokenizer pattern.  Group 1 captures quoted string literals and operators,
# group 2 captures names (optionally carrying a "{uri}" namespace part);
# a run of whitespace matches with both groups empty.
xpath_tokenizer_re = re.compile(
    "("
    "'[^']*'|\"[^\"]*\"|"
    "::|"
    "//?|"
    r"\.\.|"
    r"\(\)|"
    r"[/.*:\[\]\(\)@=])|"
    r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
    r"\s+"
    )


def xpath_tokenizer(pattern, namespaces=None):
    """Yield ``(operator, name)`` token pairs for the path *pattern*.

    Exactly one item of each pair is non-empty, except for whitespace,
    which is reported as ``('', '')``.

    *namespaces* optionally maps prefixes to namespace URIs; the key
    ``None`` holds the default namespace.  A ``prefix:name`` token is
    rewritten to ``{uri}name``.  An unprefixed name is put into the default
    namespace, unless it directly follows an "@" token: unprefixed
    attribute names are never namespace-qualified, so qualifying them would
    make ``[@attr]`` predicates fail to match.  Names that already start
    with "{" are passed through untouched.

    Raises SyntaxError if a prefix is not present in *namespaces*.
    """
    default_namespace = namespaces.get(None) if namespaces else None
    # True when the previous token was "@", i.e. the next name is an
    # attribute name and must not receive the default namespace.
    parsing_attribute = False
    for token in xpath_tokenizer_re.findall(pattern):
        ttype, tag = token
        if tag and tag[0] != "{":
            if ":" in tag:
                prefix, local_name = tag.split(":", 1)
                try:
                    if not namespaces:
                        raise KeyError
                    yield ttype, "{%s}%s" % (namespaces[prefix], local_name)
                except KeyError:
                    raise SyntaxError("prefix %r not found in prefix map" % prefix)
            elif default_namespace and not parsing_attribute:
                yield ttype, "{%s}%s" % (default_namespace, tag)
            else:
                yield token
            parsing_attribute = False
        else:
            yield token
            parsing_attribute = ttype == '@'
-
-
def prepare_child(next, token):
    """Build the selector for a plain name step.

    The name is taken from ``token[1]``; *next* is not used.  The returned
    selector yields, for each element of its input, whatever
    ``iterchildren(name)`` produces for that element.
    """
    child_tag = token[1]

    def select(result):
        for parent in result:
            for child in parent.iterchildren(child_tag):
                yield child

    return select
-
def prepare_star(next, token):
    """Build the selector for the "*" step.

    Neither argument is used.  The returned selector yields, for each
    element of its input, whatever ``iterchildren('*')`` produces.
    """
    def select(result):
        for parent in result:
            for child in parent.iterchildren('*'):
                yield child

    return select
-
def prepare_self(next, token):
    """Build the selector for the "." step.

    Neither argument is used.  The returned selector hands back its input
    object unchanged.
    """
    def select(result):
        # "." keeps the current context as it is.
        return result

    return select
-
def prepare_descendant(next, token):
    """Build the selector for a "//" step.

    The step that follows "//" is read with ``next()``.  It must be either
    "*" or a plain name; anything else raises SyntaxError.  The returned
    selector yields, for each element of its input, whatever
    ``iterdescendants(name)`` produces.
    """
    following = next()
    kind = following[0]
    if kind == "*":
        wanted = "*"
    elif kind:
        # any other operator directly after "//" is not supported
        raise SyntaxError("invalid descendant")
    else:
        wanted = following[1]

    def select(result):
        for ancestor in result:
            for descendant in ancestor.iterdescendants(wanted):
                yield descendant

    return select
-
def prepare_parent(next, token):
    """Build the selector for the ".." step.

    Neither argument is used.  The returned selector yields
    ``getparent()`` of each input element, skipping elements for which
    that call returns None.
    """
    def select(result):
        for child in result:
            owner = child.getparent()
            if owner is None:
                continue
            yield owner

    return select
-
def prepare_predicate(next, token):
    """Build the selector for a bracketed predicate.

    Tokens are pulled from ``next()`` up to the closing "]" and condensed
    into a signature string: an operator contributes itself, a quoted
    string contributes "'", and a bare name or number contributes "-".
    Whitespace tokens are dropped.  The signature selects the form:

        "@-"                 [@attribute]
        "@-='"               [@attribute='value']
        "-"  (not a number)  [tag]
        ".='" / "-='"        [.='value'] / [tag='value']
        "-"  (a number)      [index]
        "-()" / "-()-"       [last()] / [last()-index]

    Raises SyntaxError for an index below 1, for a function other than
    last(), for a non-integer offset after last(), and for any other
    predicate form.
    """
    # FIXME: replace with real parser!!! refs:
    # http://effbot.org/zone/simple-iterator-parser.htm
    # http://javascript.crockford.com/tdop/tdop.html
    kinds = []
    values = []
    while True:
        token = next()
        if token[0] == "]":
            break
        if token == ('', ''):
            # ignore whitespace
            continue
        if token[0] and token[0][:1] in "'\"":
            # quoted literal: record it as "'" and strip the quotes
            token = "'", token[0][1:-1]
        kinds.append(token[0] or "-")
        values.append(token[1])
    signature = "".join(kinds)

    def first_is_name():
        # Evaluated lazily, only for signatures that guarantee values[0]
        # exists; an integer there means an index, not a tag name.
        return not re.match(r"-?\d+$", values[0])

    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        attr_name = values[1]

        def select(result):
            for candidate in result:
                if candidate.get(attr_name) is not None:
                    yield candidate
        return select

    if signature == "@-='":
        # [@attribute='value']
        attr_name = values[1]
        expected = values[-1]

        def select(result):
            for candidate in result:
                if candidate.get(attr_name) == expected:
                    yield candidate
        return select

    if signature == "-" and first_is_name():
        # [tag]
        child_tag = values[0]

        def select(result):
            for candidate in result:
                # one matching child is enough to keep the candidate
                for _ in candidate.iterchildren(child_tag):
                    yield candidate
                    break
        return select

    if signature == ".='" or (signature == "-='" and first_is_name()):
        # [.='value'] or [tag='value']
        child_tag = values[0]
        expected = values[-1]
        if child_tag:
            def select(result):
                for candidate in result:
                    for child in candidate.iterchildren(child_tag):
                        if "".join(child.itertext()) == expected:
                            yield candidate
                            break
        else:
            # "." carries an empty name: compare the candidate's own text
            def select(result):
                for candidate in result:
                    if "".join(candidate.itertext()) == expected:
                        yield candidate
        return select

    if signature in ("-", "-()", "-()-"):
        # [index] or [last()] or [last()-index]
        if signature == "-":
            # [index]; converted to a 0-based list position
            index = int(values[0]) - 1
            if index == -1:
                raise SyntaxError(
                    "indices in path predicates are 1-based, not 0-based")
            if index < 0:
                raise SyntaxError("path index >= 1 expected")
        elif values[0] != "last":
            raise SyntaxError("unsupported function")
        elif signature == "-()-":
            # the offset token keeps its sign (e.g. "-1"), so this yields
            # a negative list position counted from the end
            try:
                index = int(values[2]) - 1
            except ValueError:
                raise SyntaxError("unsupported expression")
        else:
            index = -1

        def select(result):
            for candidate in result:
                owner = candidate.getparent()
                if owner is None:
                    continue
                try:
                    # FIXME: what if the selector is "*" ?
                    siblings = list(owner.iterchildren(candidate.tag))
                    if siblings[index] is candidate:
                        yield candidate
                except IndexError:
                    pass
        return select

    raise SyntaxError("invalid predicate")
-
# Dispatch table used while compiling a path: the first item of a token
# (empty for plain names) selects the factory that builds that step.
ops = {
    "": prepare_child,          # plain name
    "*": prepare_star,          # wildcard
    ".": prepare_self,          # current context
    "..": prepare_parent,       # parent
    "//": prepare_descendant,   # descendants
    "[": prepare_predicate,     # bracketed predicate
}


# --------------------------------------------------------------------

# Compiled selectors, keyed by the path plus the namespace mapping.
_cache = {}
-
-
def _build_path_iterator(path, namespaces):
    """Compile *path* into a list of selector callables.

    A path ending in "/" is treated as if it ended in "/*".  Results are
    kept in the module-level ``_cache``, keyed by the path and the
    contents of *namespaces*; the cache is emptied once it holds more than
    100 entries.

    Raises ValueError if *namespaces* contains the empty string as a
    prefix, and SyntaxError for an absolute, empty or incomplete path.
    """
    if path[-1:] == "/":
        path += "*"  # implicit all (FIXME: keep this?)

    key_parts = [path]
    if namespaces:
        if '' in namespaces:
            raise ValueError("empty namespace prefix must be passed as None, not the empty string")
        if None in namespaces:
            # Keep the None prefix out of the sort: its URI goes first,
            # followed by the remaining (prefix, uri) pairs in order.
            key_parts.append(namespaces[None])
            key_parts.extend(sorted(
                item for item in namespaces.items() if item[0] is not None))
        else:
            key_parts.extend(sorted(namespaces.items()))
    cache_key = tuple(key_parts)

    compiled = _cache.get(cache_key)
    if compiled is not None:
        return compiled
    if len(_cache) > 100:
        _cache.clear()

    if path[:1] == "/":
        raise SyntaxError("cannot use absolute path on element")
    stream = iter(xpath_tokenizer(path, namespaces))
    # Python 2 iterators expose next(), Python 3 ones __next__().
    _next = getattr(stream, "next", None)
    if _next is None:
        _next = stream.__next__
    try:
        token = _next()
    except StopIteration:
        raise SyntaxError("empty path expression")

    selector = []
    while True:
        try:
            # a factory may consume further tokens; running out of them
            # in the middle of a step means the path is incomplete
            step = ops[token[0]](_next, token)
        except StopIteration:
            raise SyntaxError("invalid path")
        selector.append(step)
        try:
            token = _next()
            if token[0] == "/":
                # skip the separator between two steps
                token = _next()
        except StopIteration:
            break
    _cache[cache_key] = selector
    return selector
-
-
- ##
- # Iterate over the matching nodes
-
def iterfind(elem, path, namespaces=None):
    """Return an iterator over the matches of *path* relative to *elem*.

    The path is compiled by ``_build_path_iterator`` and each compiled
    step is applied to the output of the previous one, starting from an
    iterator that contains only *elem*.
    """
    steps = _build_path_iterator(path, namespaces)
    matches = iter((elem,))
    for step in steps:
        matches = step(matches)
    return matches
-
-
- ##
- # Find first matching object.
-
def find(elem, path, namespaces=None):
    """Return the first item produced by ``iterfind``, or None if there
    is none.
    """
    return next(iterfind(elem, path, namespaces), None)
-
-
- ##
- # Find all matching objects.
-
def findall(elem, path, namespaces=None):
    """Return every item produced by ``iterfind`` as a list."""
    matches = iterfind(elem, path, namespaces)
    return list(matches)
-
-
- ##
- # Find text for first matching object.
-
def findtext(elem, path, default=None, namespaces=None):
    """Return the ``text`` of the first match of *path*.

    Returns *default* when ``find`` reports no match, and the empty string
    when the match has a false ``text`` value (for example None).
    """
    match = find(elem, path, namespaces)
    if match is not None:
        return match.text or ''
    return default
|