ElementPath.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. #
  2. # ElementTree
  3. # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
  4. #
  5. # limited xpath support for element trees
  6. #
  7. # history:
  8. # 2003-05-23 fl created
  9. # 2003-05-28 fl added support for // etc
  10. # 2003-08-27 fl fixed parsing of periods in element names
  11. # 2007-09-10 fl new selection engine
  12. # 2007-09-12 fl fixed parent selector
  13. # 2007-09-13 fl added iterfind; changed findall to return a list
  14. # 2007-11-30 fl added namespaces support
  15. # 2009-10-30 fl added child element value filter
  16. #
  17. # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
  18. #
  19. # fredrik@pythonware.com
  20. # http://www.pythonware.com
  21. #
  22. # --------------------------------------------------------------------
  23. # The ElementTree toolkit is
  24. #
  25. # Copyright (c) 1999-2009 by Fredrik Lundh
  26. #
  27. # By obtaining, using, and/or copying this software and/or its
  28. # associated documentation, you agree that you have read, understood,
  29. # and will comply with the following terms and conditions:
  30. #
  31. # Permission to use, copy, modify, and distribute this software and
  32. # its associated documentation for any purpose and without fee is
  33. # hereby granted, provided that the above copyright notice appears in
  34. # all copies, and that both that copyright notice and this permission
  35. # notice appear in supporting documentation, and that the name of
  36. # Secret Labs AB or the author not be used in advertising or publicity
  37. # pertaining to distribution of the software without specific, written
  38. # prior permission.
  39. #
  40. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  41. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  42. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  43. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  44. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  45. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  46. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  47. # OF THIS SOFTWARE.
  48. # --------------------------------------------------------------------
  49. # Licensed to PSF under a Contributor Agreement.
  50. # See http://www.python.org/psf/license for licensing details.
  51. ##
  52. # Implementation module for XPath support. There's usually no reason
  53. # to import this module directly; the <b>ElementTree</b> does this for
  54. # you, if needed.
  55. ##
  56. import re
  57. xpath_tokenizer_re = re.compile(
  58. r"("
  59. r"'[^']*'|\"[^\"]*\"|"
  60. r"::|"
  61. r"//?|"
  62. r"\.\.|"
  63. r"\(\)|"
  64. r"[/.*:\[\]\(\)@=])|"
  65. r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
  66. r"\s+"
  67. )
  68. def xpath_tokenizer(pattern, namespaces=None):
  69. for token in xpath_tokenizer_re.findall(pattern):
  70. tag = token[1]
  71. if tag and tag[0] != "{" and ":" in tag:
  72. try:
  73. prefix, uri = tag.split(":", 1)
  74. if not namespaces:
  75. raise KeyError
  76. yield token[0], "{%s}%s" % (namespaces[prefix], uri)
  77. except KeyError:
  78. raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
  79. else:
  80. yield token
  81. def get_parent_map(context):
  82. parent_map = context.parent_map
  83. if parent_map is None:
  84. context.parent_map = parent_map = {}
  85. for p in context.root.iter():
  86. for e in p:
  87. parent_map[e] = p
  88. return parent_map
  89. def prepare_child(next, token):
  90. tag = token[1]
  91. def select(context, result):
  92. for elem in result:
  93. for e in elem:
  94. if e.tag == tag:
  95. yield e
  96. return select
  97. def prepare_star(next, token):
  98. def select(context, result):
  99. for elem in result:
  100. yield from elem
  101. return select
  102. def prepare_self(next, token):
  103. def select(context, result):
  104. yield from result
  105. return select
  106. def prepare_descendant(next, token):
  107. try:
  108. token = next()
  109. except StopIteration:
  110. return
  111. if token[0] == "*":
  112. tag = "*"
  113. elif not token[0]:
  114. tag = token[1]
  115. else:
  116. raise SyntaxError("invalid descendant")
  117. def select(context, result):
  118. for elem in result:
  119. for e in elem.iter(tag):
  120. if e is not elem:
  121. yield e
  122. return select
  123. def prepare_parent(next, token):
  124. def select(context, result):
  125. # FIXME: raise error if .. is applied at toplevel?
  126. parent_map = get_parent_map(context)
  127. result_map = {}
  128. for elem in result:
  129. if elem in parent_map:
  130. parent = parent_map[elem]
  131. if parent not in result_map:
  132. result_map[parent] = None
  133. yield parent
  134. return select
  135. def prepare_predicate(next, token):
  136. # FIXME: replace with real parser!!! refs:
  137. # http://effbot.org/zone/simple-iterator-parser.htm
  138. # http://javascript.crockford.com/tdop/tdop.html
  139. signature = []
  140. predicate = []
  141. while 1:
  142. try:
  143. token = next()
  144. except StopIteration:
  145. return
  146. if token[0] == "]":
  147. break
  148. if token == ('', ''):
  149. # ignore whitespace
  150. continue
  151. if token[0] and token[0][:1] in "'\"":
  152. token = "'", token[0][1:-1]
  153. signature.append(token[0] or "-")
  154. predicate.append(token[1])
  155. signature = "".join(signature)
  156. # use signature to determine predicate type
  157. if signature == "@-":
  158. # [@attribute] predicate
  159. key = predicate[1]
  160. def select(context, result):
  161. for elem in result:
  162. if elem.get(key) is not None:
  163. yield elem
  164. return select
  165. if signature == "@-='":
  166. # [@attribute='value']
  167. key = predicate[1]
  168. value = predicate[-1]
  169. def select(context, result):
  170. for elem in result:
  171. if elem.get(key) == value:
  172. yield elem
  173. return select
  174. if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
  175. # [tag]
  176. tag = predicate[0]
  177. def select(context, result):
  178. for elem in result:
  179. if elem.find(tag) is not None:
  180. yield elem
  181. return select
  182. if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])):
  183. # [.='value'] or [tag='value']
  184. tag = predicate[0]
  185. value = predicate[-1]
  186. if tag:
  187. def select(context, result):
  188. for elem in result:
  189. for e in elem.findall(tag):
  190. if "".join(e.itertext()) == value:
  191. yield elem
  192. break
  193. else:
  194. def select(context, result):
  195. for elem in result:
  196. if "".join(elem.itertext()) == value:
  197. yield elem
  198. return select
  199. if signature == "-" or signature == "-()" or signature == "-()-":
  200. # [index] or [last()] or [last()-index]
  201. if signature == "-":
  202. # [index]
  203. index = int(predicate[0]) - 1
  204. if index < 0:
  205. raise SyntaxError("XPath position >= 1 expected")
  206. else:
  207. if predicate[0] != "last":
  208. raise SyntaxError("unsupported function")
  209. if signature == "-()-":
  210. try:
  211. index = int(predicate[2]) - 1
  212. except ValueError:
  213. raise SyntaxError("unsupported expression")
  214. if index > -2:
  215. raise SyntaxError("XPath offset from last() must be negative")
  216. else:
  217. index = -1
  218. def select(context, result):
  219. parent_map = get_parent_map(context)
  220. for elem in result:
  221. try:
  222. parent = parent_map[elem]
  223. # FIXME: what if the selector is "*" ?
  224. elems = list(parent.findall(elem.tag))
  225. if elems[index] is elem:
  226. yield elem
  227. except (IndexError, KeyError):
  228. pass
  229. return select
  230. raise SyntaxError("invalid predicate")
# Dispatch table used while compiling a path: the operator part of a token
# (token[0]) selects the prepare_* factory that builds the selector step.
ops = {
    # an empty operator part means the token is a plain tag name
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }
# Compiled selectors (lists of select functions), keyed by the cache key
# that iterfind builds from the path and the namespace mapping.
_cache = {}
class _SelectorContext:
    """Per-search state handed to every select function.

    Holds the element the search starts from (``root``) and a
    ``parent_map`` slot that get_parent_map fills in on first use.
    """
    # class-level default; get_parent_map replaces it with a dict on the
    # instance the first time a parent lookup is needed
    parent_map = None
    def __init__(self, root):
        self.root = root
  244. # --------------------------------------------------------------------
  245. ##
  246. # Generate all matching objects.
  247. def iterfind(elem, path, namespaces=None):
  248. # compile selector pattern
  249. cache_key = (path, None if namespaces is None
  250. else tuple(sorted(namespaces.items())))
  251. if path[-1:] == "/":
  252. path = path + "*" # implicit all (FIXME: keep this?)
  253. try:
  254. selector = _cache[cache_key]
  255. except KeyError:
  256. if len(_cache) > 100:
  257. _cache.clear()
  258. if path[:1] == "/":
  259. raise SyntaxError("cannot use absolute path on element")
  260. next = iter(xpath_tokenizer(path, namespaces)).__next__
  261. try:
  262. token = next()
  263. except StopIteration:
  264. return
  265. selector = []
  266. while 1:
  267. try:
  268. selector.append(ops[token[0]](next, token))
  269. except StopIteration:
  270. raise SyntaxError("invalid path") from None
  271. try:
  272. token = next()
  273. if token[0] == "/":
  274. token = next()
  275. except StopIteration:
  276. break
  277. _cache[cache_key] = selector
  278. # execute selector pattern
  279. result = [elem]
  280. context = _SelectorContext(elem)
  281. for select in selector:
  282. result = select(context, result)
  283. return result
  284. ##
  285. # Find first matching object.
  286. def find(elem, path, namespaces=None):
  287. return next(iterfind(elem, path, namespaces), None)
  288. ##
  289. # Find all matching objects.
  290. def findall(elem, path, namespaces=None):
  291. return list(iterfind(elem, path, namespaces))
  292. ##
  293. # Find text for first matching object.
  294. def findtext(elem, path, default=None, namespaces=None):
  295. try:
  296. elem = next(iterfind(elem, path, namespaces))
  297. return elem.text or ""
  298. except StopIteration:
  299. return default