You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

102 lines
3.3 KiB

4 years ago
  1. """CSS Selectors based on XPath.
  2. This module supports selecting XML/HTML tags based on CSS selectors.
  3. See the `CSSSelector` class for details.
  4. This is a thin wrapper around cssselect 0.7 or later.
  5. """
  6. from __future__ import absolute_import
  7. from . import etree
  8. try:
  9. import cssselect as external_cssselect
  10. except ImportError:
  11. raise ImportError(
  12. 'cssselect does not seem to be installed. '
  13. 'See http://packages.python.org/cssselect/')
  # Re-export the exception classes of the external ``cssselect`` package
  # under this module's own namespace.
  SelectorError = external_cssselect.SelectorError
  ExpressionError = external_cssselect.ExpressionError
  SelectorSyntaxError = external_cssselect.SelectorSyntaxError

  # Names exported by ``from <this module> import *``.
  __all__ = [
      'SelectorSyntaxError',
      'ExpressionError',
      'SelectorError',
      'CSSSelector',
  ]
  class LxmlTranslator(external_cssselect.GenericTranslator):
      """
      CSS-to-XPath translator that adds lxml-specific extensions on top of
      the generic cssselect translator.
      """

      def xpath_contains_function(self, xpath, function):
          """Translate the ``:contains()`` pseudo-class function.

          ``:contains()`` was defined in an early CSS3 selectors draft and
          removed in later ones:
          http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors

          The single argument must be a string or an identifier.  It is
          lower-cased here and compared against the lower-cased string
          value of the element, so the match ignores case.

          Raises ``ExpressionError`` for any other argument list.
          """
          accepted_signatures = (['STRING'], ['IDENT'])
          if function.argument_types() in accepted_signatures:
              needle = function.arguments[0].value
              condition = (
                  'contains(__lxml_internal_css:lower-case(string(.)), %s)'
                  % self.xpath_literal(needle.lower()))
              return xpath.add_condition(condition)

          raise ExpressionError(
              "Expected a single string or ident for :contains(), got %r"
              % function.arguments)
  class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
      """
      Translator combining the lxml extensions of ``LxmlTranslator`` with
      the HTML support of cssselect's ``HTMLTranslator``.
      """
  38. def _make_lower_case(context, s):
  39. return s.lower()
  # Register the helper as an XPath extension function.  The prefix and the
  # function name set here are the ones spelled out in the XPath expression
  # built by LxmlTranslator.xpath_contains_function
  # ('__lxml_internal_css:lower-case(...)').
  ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
  ns.prefix = '__lxml_internal_css'
  ns['lower-case'] = _make_lower_case
  class CSSSelector(etree.XPath):
      """An XPath evaluator built from a CSS selector.

      The selector is translated to an XPath expression once, when the
      object is created; the original selector text is kept in the ``css``
      attribute.

      Usage::

          >>> from lxml import etree, cssselect
          >>> select = cssselect.CSSSelector("a tag > child")

          >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
          >>> [ el.tag for el in select(root) ]
          ['child']

      To use CSS namespaces, you need to pass a prefix-to-namespace
      mapping as ``namespaces`` keyword argument::

          >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
          >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
          ...                                   namespaces={'rdf': rdfns})

          >>> rdf = etree.XML((
          ...     '<root xmlns:rdf="%s">'
          ...       '<rdf:Description>blah</rdf:Description>'
          ...     '</root>') % rdfns)
          >>> [(el.tag, el.text) for el in select_ns(rdf)]
          [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]

      """

      def __init__(self, css, namespaces=None, translator='xml'):
          """Translate ``css`` to XPath and initialise the XPath evaluator.

          ``translator`` may be ``'xml'``, ``'html'`` or ``'xhtml'`` to pick
          one of the built-in translators.  Any other value is used as the
          translator object itself and must provide ``css_to_xpath()``.
          ``namespaces`` is passed on to ``etree.XPath``.
          """
          # The three names are mutually exclusive, so the order in which
          # they are tested does not matter.
          if translator == 'xhtml':
              translator = LxmlHTMLTranslator(xhtml=True)
          elif translator == 'html':
              translator = LxmlHTMLTranslator()
          elif translator == 'xml':
              translator = LxmlTranslator()

          xpath_expression = translator.css_to_xpath(css)
          etree.XPath.__init__(self, xpath_expression, namespaces=namespaces)
          self.css = css

      def __repr__(self):
          """Return ``<ClassName hex-id for 'selector'>``."""
          class_name = self.__class__.__name__
          # Drop the leading '0x' from the hexadecimal object id.
          object_id = hex(abs(id(self)))[2:]
          return '<%s %s for %r>' % (class_name, object_id, self.css)