You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

333 lines
9.9 KiB

4 years ago
  1. #
  2. # ElementTree
  3. # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
  4. #
  5. # limited xpath support for element trees
  6. #
  7. # history:
  8. # 2003-05-23 fl created
  9. # 2003-05-28 fl added support for // etc
  10. # 2003-08-27 fl fixed parsing of periods in element names
  11. # 2007-09-10 fl new selection engine
  12. # 2007-09-12 fl fixed parent selector
  13. # 2007-09-13 fl added iterfind; changed findall to return a list
  14. # 2007-11-30 fl added namespaces support
  15. # 2009-10-30 fl added child element value filter
  16. #
  17. # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
  18. #
  19. # fredrik@pythonware.com
  20. # http://www.pythonware.com
  21. #
  22. # --------------------------------------------------------------------
  23. # The ElementTree toolkit is
  24. #
  25. # Copyright (c) 1999-2009 by Fredrik Lundh
  26. #
  27. # By obtaining, using, and/or copying this software and/or its
  28. # associated documentation, you agree that you have read, understood,
  29. # and will comply with the following terms and conditions:
  30. #
  31. # Permission to use, copy, modify, and distribute this software and
  32. # its associated documentation for any purpose and without fee is
  33. # hereby granted, provided that the above copyright notice appears in
  34. # all copies, and that both that copyright notice and this permission
  35. # notice appear in supporting documentation, and that the name of
  36. # Secret Labs AB or the author not be used in advertising or publicity
  37. # pertaining to distribution of the software without specific, written
  38. # prior permission.
  39. #
  40. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  41. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  42. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  43. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  44. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  45. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  46. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  47. # OF THIS SOFTWARE.
  48. # --------------------------------------------------------------------
  49. ##
  50. # Implementation module for XPath support. There's usually no reason
  51. # to import this module directly; the <b>ElementTree</b> does this for
  52. # you, if needed.
  53. ##
  54. import re
  # Tokenizer pattern.  Every match produces a 2-tuple (operator, name):
  #   group 1 - quoted string literals and operator/punctuation tokens
  #   group 2 - names, optionally carrying a leading "{uri}" namespace part
  # A run of whitespace matches neither group and shows up as ('', '').
  xpath_tokenizer_re = re.compile(
      "("
      "'[^']*'|\"[^\"]*\"|"
      "::|"
      "//?|"
      r"\.\.|"
      r"\(\)|"
      r"[/.*:\[\]\(\)@=])|"
      r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
      r"\s+"
      )


  def xpath_tokenizer(pattern, namespaces=None):
      """Split a path expression into (operator, name) tokens.

      Names written as ``prefix:local`` are rewritten to ``{uri}local`` using
      the *namespaces* mapping.  Unprefixed element names are put into the
      default namespace (the mapping's ``None`` key) when one is given.
      Names that already start with ``{`` are passed through untouched.

      A name that directly follows an ``@`` token is an attribute name and is
      never put into the default namespace: unprefixed attributes do not
      belong to any namespace, so qualifying them would make ``[@attr]``
      predicates look up ``{default-ns}attr`` and fail to match.

      :param pattern: the path expression to tokenize.
      :param namespaces: optional mapping of prefix -> namespace URI; the
          ``None`` key holds the default namespace.
      :returns: a generator of 2-tuples ``(operator, name)``.
      :raises SyntaxError: if a prefix is used that is not in *namespaces*.
      """
      default_namespace = namespaces.get(None) if namespaces else None
      # True while the previously yielded token was "@".
      parsing_attribute = False
      for token in xpath_tokenizer_re.findall(pattern):
          ttype, tag = token
          if tag and tag[0] != "{":
              if ":" in tag:
                  prefix, local = tag.split(":", 1)
                  try:
                      if not namespaces:
                          raise KeyError
                      ns_uri = namespaces[prefix]
                  except KeyError:
                      raise SyntaxError(
                          "prefix %r not found in prefix map" % prefix)
                  yield ttype, "{%s}%s" % (ns_uri, local)
              elif default_namespace and not parsing_attribute:
                  yield ttype, "{%s}%s" % (default_namespace, tag)
              else:
                  yield token
              parsing_attribute = False
          else:
              yield token
              parsing_attribute = ttype == '@'
  def prepare_child(next, token):
      """Build the selector for a plain tag name step.

      :param next: callable returning the following token (unused here).
      :param token: the ``(operator, name)`` token; ``token[1]`` is the tag.
      :returns: a generator function that, for every element of the incoming
          result, yields what ``iterchildren(tag)`` returns for it.
      """
      wanted = token[1]

      def select(result):
          for parent in result:
              for child in parent.iterchildren(wanted):
                  yield child

      return select
  def prepare_star(next, token):
      """Build the selector for the ``*`` step.

      Neither argument is inspected.  The returned generator function yields,
      for every element of the incoming result, what ``iterchildren('*')``
      returns for it.
      """
      def select(result):
          for parent in result:
              for child in parent.iterchildren('*'):
                  yield child

      return select
  def prepare_self(next, token):
      """Build the selector for the ``.`` step.

      Neither argument is inspected.  The returned function hands the
      incoming result back unchanged.
      """
      def passthrough(result):
          return result

      return passthrough
  def prepare_descendant(next, token):
      """Build the selector for a ``//`` step.

      The token that follows ``//`` is consumed via *next* and decides what
      to look for: ``*`` selects every tag, a plain name selects that tag.

      :param next: callable returning the following token.
      :param token: the ``//`` token itself (its value is not used).
      :returns: a generator function that, for every element of the incoming
          result, yields what ``iterdescendants(tag)`` returns for it.
      :raises SyntaxError: if ``//`` is followed by any other operator.
      """
      following = next()
      operator = following[0]
      if operator == "*":
          tag = "*"
      elif operator:
          raise SyntaxError("invalid descendant")
      else:
          tag = following[1]

      def select(result):
          for ancestor in result:
              for node in ancestor.iterdescendants(tag):
                  yield node

      return select
  def prepare_parent(next, token):
      """Build the selector for the ``..`` step.

      Neither argument is inspected.  The returned generator function yields
      ``getparent()`` of every element in the incoming result, skipping
      elements for which it returns None.  No de-duplication is done, so two
      siblings in the result produce their parent twice.
      """
      def select(result):
          for node in result:
              owner = node.getparent()
              if owner is None:
                  continue
              yield owner

      return select
  def prepare_predicate(next, token):
      """Compile a ``[...]`` predicate into a filtering selector.

      Tokens are pulled from *next* until the closing ``]``.  Each one adds
      its operator to a *signature* string (``-`` stands for a name token,
      ``'`` for a quoted literal of either quote style) and its name or
      literal value to the *predicate* list.  The signature then picks one
      of the supported forms:

      ========================  ==========================================
      signature                 predicate
      ========================  ==========================================
      ``@-``                    ``[@attribute]``
      ``@-='``                  ``[@attribute='value']``
      ``-`` (non-numeric)       ``[tag]``
      ``.='`` / ``-='``         ``[.='value']`` / ``[tag='value']``
      ``-`` (numeric)           ``[index]`` (1-based)
      ``-()`` / ``-()-``        ``[last()]`` / ``[last()-offset]``
      ========================  ==========================================

      :param next: callable returning the following token; any exception it
          raises before ``]`` is seen propagates to the caller.
      :param token: the ``[`` token itself (its value is not used).
      :returns: a generator function taking an iterable of elements and
          yielding those that satisfy the predicate.
      :raises SyntaxError: for an unsupported predicate form, an index below
          1, a function other than ``last``, a non-integer offset, or a
          positive offset after ``last()``.
      """
      # FIXME: replace with real parser!!! refs:
      # http://effbot.org/zone/simple-iterator-parser.htm
      # http://javascript.crockford.com/tdop/tdop.html
      signature = ''
      predicate = []
      while 1:
          token = next()
          if token[0] == "]":
              break
          if token == ('', ''):
              # ignore whitespace
              continue
          if token[0] and token[0][:1] in "'\"":
              # quoted literal: record it as "'" and keep the unquoted value
              token = "'", token[0][1:-1]
          signature += token[0] or "-"
          predicate.append(token[1])

      # use signature to determine predicate type
      if signature == "@-":
          # [@attribute] predicate
          key = predicate[1]

          def select(result):
              for elem in result:
                  if elem.get(key) is not None:
                      yield elem
          return select

      if signature == "@-='":
          # [@attribute='value']
          key = predicate[1]
          value = predicate[-1]

          def select(result):
              for elem in result:
                  if elem.get(key) == value:
                      yield elem
          return select

      if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
          # [tag]: keep elements that have at least one such child
          tag = predicate[0]

          def select(result):
              for elem in result:
                  for _ in elem.iterchildren(tag):
                      yield elem
                      break
          return select

      if signature == ".='" or (
              signature == "-='" and not re.match(r"-?\d+$", predicate[0])):
          # [.='value'] or [tag='value']
          # For the "." form the name slot is empty, which picks the
          # second selector below.
          tag = predicate[0]
          value = predicate[-1]
          if tag:
              def select(result):
                  for elem in result:
                      for e in elem.iterchildren(tag):
                          if "".join(e.itertext()) == value:
                              yield elem
                              break
          else:
              def select(result):
                  for elem in result:
                      if "".join(elem.itertext()) == value:
                          yield elem
          return select

      if signature == "-" or signature == "-()" or signature == "-()-":
          # [index] or [last()] or [last()-index]
          if signature == "-":
              # [index]: convert the 1-based position to a list index
              index = int(predicate[0]) - 1
              if index < 0:
                  if index == -1:
                      raise SyntaxError(
                          "indices in path predicates are 1-based, not 0-based")
                  else:
                      raise SyntaxError("path index >= 1 expected")
          else:
              if predicate[0] != "last":
                  raise SyntaxError("unsupported function")
              if signature == "-()-":
                  # The tokenizer delivers "-N" as a single name token, so
                  # the offset arrives with its sign attached.
                  try:
                      offset = int(predicate[2])
                  except ValueError:
                      raise SyntaxError("unsupported expression")
                  if offset > 0:
                      # "[last()1]" used to be accepted and silently
                      # treated as an index counted from the front.
                      raise SyntaxError(
                          "positive offset from last() is not supported")
                  index = offset - 1
              else:
                  index = -1

          def select(result):
              for elem in result:
                  parent = elem.getparent()
                  if parent is None:
                      continue
                  # FIXME: what if the selector is "*" ?
                  # Position is counted among siblings with the same tag.
                  elems = list(parent.iterchildren(elem.tag))
                  try:
                      candidate = elems[index]
                  except IndexError:
                      continue
                  if candidate is elem:
                      yield elem
          return select

      raise SyntaxError("invalid predicate")
  # Dispatch table used while compiling a path: the operator part of a token
  # (token[0]) selects the prepare_* function that builds the selector for
  # that step.  The empty string is the operator of a plain name token.
  ops = {
      "": prepare_child,
      "*": prepare_star,
      ".": prepare_self,
      "..": prepare_parent,
      "//": prepare_descendant,
      "[": prepare_predicate,
      }
  # --------------------------------------------------------------------
  # Compiled selector lists, stored under the cache key that
  # _build_path_iterator derives from the path and namespace mapping.
  _cache = {}
  def _build_path_iterator(path, namespaces):
      """Compile *path* into a list of selector functions, with caching.

      A trailing ``/`` is treated as ``/*``.  The cache key combines the
      path with the namespace mapping; the default namespace (``None`` key)
      is placed first and left out of the sorted remainder.  The cache is
      emptied once it holds more than 100 entries.

      :param path: the path expression.
      :param namespaces: optional mapping of prefix -> namespace URI.
      :returns: list of selector functions, one per step or predicate.
      :raises ValueError: if *namespaces* contains the empty-string prefix.
      :raises SyntaxError: for an absolute path, an empty path, or a path
          that ends in the middle of a step.
      """
      if path[-1:] == "/":
          path += "*"  # implicit all (FIXME: keep this?)

      key = (path,)
      if namespaces:
          if '' in namespaces:
              raise ValueError("empty namespace prefix must be passed as None, not the empty string")
          if None in namespaces:
              default_uri = namespaces[None]
              prefixed = sorted(
                  item for item in namespaces.items() if item[0] is not None)
              key += (default_uri,) + tuple(prefixed)
          else:
              key += tuple(sorted(namespaces.items()))

      if key in _cache:
          return _cache[key]

      if len(_cache) > 100:
          _cache.clear()

      if path[:1] == "/":
          raise SyntaxError("cannot use absolute path on element")

      tokens = iter(xpath_tokenizer(path, namespaces))
      try:
          _next = tokens.next
      except AttributeError:
          # Python 3
          _next = tokens.__next__

      try:
          token = _next()
      except StopIteration:
          raise SyntaxError("empty path expression")

      steps = []
      while True:
          # An operator missing from the table surfaces as KeyError.
          prepare = ops[token[0]]
          try:
              steps.append(prepare(_next, token))
          except StopIteration:
              # the prepare function ran out of tokens in mid-step
              raise SyntaxError("invalid path")
          try:
              token = _next()
              if token[0] == "/":
                  # the step separator itself carries no meaning
                  token = _next()
          except StopIteration:
              break

      _cache[key] = steps
      return steps
  273. ##
  274. # Iterate over the matching nodes
  def iterfind(elem, path, namespaces=None):
      """Return an iterator over the elements matching *path* below *elem*.

      The compiled selectors are chained: each one consumes the output of
      the previous one, starting from an iterator holding only *elem*.
      """
      steps = _build_path_iterator(path, namespaces)
      matches = iter((elem,))
      for step in steps:
          matches = step(matches)
      return matches
  281. ##
  282. # Find first matching object.
  def find(elem, path, namespaces=None):
      """Return the first element matching *path*, or None if none does."""
      matches = iterfind(elem, path, namespaces)
      return next(matches, None)
  289. ##
  290. # Find all matching objects.
  def findall(elem, path, namespaces=None):
      """Return a list of all elements matching *path*."""
      matches = iterfind(elem, path, namespaces)
      return list(matches)
  293. ##
  294. # Find text for first matching object.
  def findtext(elem, path, default=None, namespaces=None):
      """Return the text of the first element matching *path*.

      Gives *default* when nothing matches, and the empty string when the
      matching element's ``text`` is empty or None.
      """
      match = find(elem, path, namespaces)
      if match is not None:
          return match.text or ''
      return default