You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

759 lines
23 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. """
  3. cssselect.parser
  4. ================
  5. Tokenizer, parser and parsed objects for CSS selectors.
  6. :copyright: (c) 2007-2012 Ian Bicking and contributors.
  7. See AUTHORS for more details.
  8. :license: BSD, see LICENSE for more details.
  9. """
  10. import sys
  11. import re
  12. import operator
  13. if sys.version_info[0] < 3:
  14. _unicode = unicode
  15. _unichr = unichr
  16. else:
  17. _unicode = str
  18. _unichr = chr
  19. def ascii_lower(string):
  20. """Lower-case, but only in the ASCII range."""
  21. return string.encode('utf8').lower().decode('utf8')
class SelectorError(Exception):
    """Common parent for :class:`SelectorSyntaxError` and
    :class:`ExpressionError`.

    You can just use ``except SelectorError:`` when calling
    :meth:`~GenericTranslator.css_to_xpath` and handle both exception types.
    """
class SelectorSyntaxError(SelectorError, SyntaxError):
    """Parsing a selector that does not match the grammar.

    It derives from both :class:`SelectorError` and the built-in
    :class:`SyntaxError`, so it can be caught as either.
    """
  30. #### Parsed objects
  31. class Selector(object):
  32. """
  33. Represents a parsed selector.
  34. :meth:`~GenericTranslator.selector_to_xpath` accepts this object,
  35. but ignores :attr:`pseudo_element`. It is the users responsibility
  36. to account for pseudo-elements and reject selectors with unknown
  37. or unsupported pseudo-elements.
  38. """
  39. def __init__(self, tree, pseudo_element=None):
  40. self.parsed_tree = tree
  41. if pseudo_element is not None and not isinstance(
  42. pseudo_element, FunctionalPseudoElement):
  43. pseudo_element = ascii_lower(pseudo_element)
  44. #: A :class:`FunctionalPseudoElement`,
  45. #: or the identifier for the pseudo-element as a string,
  46. # or ``None``.
  47. #:
  48. #: +-------------------------+----------------+--------------------------------+
  49. #: | | Selector | Pseudo-element |
  50. #: +=========================+================+================================+
  51. #: | CSS3 syntax | ``a::before`` | ``'before'`` |
  52. #: +-------------------------+----------------+--------------------------------+
  53. #: | Older syntax | ``a:before`` | ``'before'`` |
  54. #: +-------------------------+----------------+--------------------------------+
  55. #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` |
  56. #: | not in Selectors3 | | |
  57. #: +-------------------------+----------------+--------------------------------+
  58. #: | Invalid pseudo-class | ``li:marker`` | ``None`` |
  59. #: +-------------------------+----------------+--------------------------------+
  60. #: | Functinal | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` |
  61. #: +-------------------------+----------------+--------------------------------+
  62. #:
  63. #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement
  64. self.pseudo_element = pseudo_element
  65. def __repr__(self):
  66. if isinstance(self.pseudo_element, FunctionalPseudoElement):
  67. pseudo_element = repr(self.pseudo_element)
  68. elif self.pseudo_element:
  69. pseudo_element = '::%s' % self.pseudo_element
  70. else:
  71. pseudo_element = ''
  72. return '%s[%r%s]' % (
  73. self.__class__.__name__, self.parsed_tree, pseudo_element)
  74. def specificity(self):
  75. """Return the specificity_ of this selector as a tuple of 3 integers.
  76. .. _specificity: http://www.w3.org/TR/selectors/#specificity
  77. """
  78. a, b, c = self.parsed_tree.specificity()
  79. if self.pseudo_element:
  80. c += 1
  81. return a, b, c
  82. class Class(object):
  83. """
  84. Represents selector.class_name
  85. """
  86. def __init__(self, selector, class_name):
  87. self.selector = selector
  88. self.class_name = class_name
  89. def __repr__(self):
  90. return '%s[%r.%s]' % (
  91. self.__class__.__name__, self.selector, self.class_name)
  92. def specificity(self):
  93. a, b, c = self.selector.specificity()
  94. b += 1
  95. return a, b, c
  96. class FunctionalPseudoElement(object):
  97. """
  98. Represents selector::name(arguments)
  99. .. attribute:: name
  100. The name (identifier) of the pseudo-element, as a string.
  101. .. attribute:: arguments
  102. The arguments of the pseudo-element, as a list of tokens.
  103. **Note:** tokens are not part of the public API,
  104. and may change between cssselect versions.
  105. Use at your own risks.
  106. """
  107. def __init__(self, name, arguments):
  108. self.name = ascii_lower(name)
  109. self.arguments = arguments
  110. def __repr__(self):
  111. return '%s[::%s(%r)]' % (
  112. self.__class__.__name__, self.name,
  113. [token.value for token in self.arguments])
  114. def argument_types(self):
  115. return [token.type for token in self.arguments]
  116. def specificity(self):
  117. a, b, c = self.selector.specificity()
  118. b += 1
  119. return a, b, c
  120. class Function(object):
  121. """
  122. Represents selector:name(expr)
  123. """
  124. def __init__(self, selector, name, arguments):
  125. self.selector = selector
  126. self.name = ascii_lower(name)
  127. self.arguments = arguments
  128. def __repr__(self):
  129. return '%s[%r:%s(%r)]' % (
  130. self.__class__.__name__, self.selector, self.name,
  131. [token.value for token in self.arguments])
  132. def argument_types(self):
  133. return [token.type for token in self.arguments]
  134. def specificity(self):
  135. a, b, c = self.selector.specificity()
  136. b += 1
  137. return a, b, c
  138. class Pseudo(object):
  139. """
  140. Represents selector:ident
  141. """
  142. def __init__(self, selector, ident):
  143. self.selector = selector
  144. self.ident = ascii_lower(ident)
  145. def __repr__(self):
  146. return '%s[%r:%s]' % (
  147. self.__class__.__name__, self.selector, self.ident)
  148. def specificity(self):
  149. a, b, c = self.selector.specificity()
  150. b += 1
  151. return a, b, c
  152. class Negation(object):
  153. """
  154. Represents selector:not(subselector)
  155. """
  156. def __init__(self, selector, subselector):
  157. self.selector = selector
  158. self.subselector = subselector
  159. def __repr__(self):
  160. return '%s[%r:not(%r)]' % (
  161. self.__class__.__name__, self.selector, self.subselector)
  162. def specificity(self):
  163. a1, b1, c1 = self.selector.specificity()
  164. a2, b2, c2 = self.subselector.specificity()
  165. return a1 + a2, b1 + b2, c1 + c2
  166. class Attrib(object):
  167. """
  168. Represents selector[namespace|attrib operator value]
  169. """
  170. def __init__(self, selector, namespace, attrib, operator, value):
  171. self.selector = selector
  172. self.namespace = namespace
  173. self.attrib = attrib
  174. self.operator = operator
  175. self.value = value
  176. def __repr__(self):
  177. if self.namespace:
  178. attrib = '%s|%s' % (self.namespace, self.attrib)
  179. else:
  180. attrib = self.attrib
  181. if self.operator == 'exists':
  182. return '%s[%r[%s]]' % (
  183. self.__class__.__name__, self.selector, attrib)
  184. else:
  185. return '%s[%r[%s %s %r]]' % (
  186. self.__class__.__name__, self.selector, attrib,
  187. self.operator, self.value)
  188. def specificity(self):
  189. a, b, c = self.selector.specificity()
  190. b += 1
  191. return a, b, c
  192. class Element(object):
  193. """
  194. Represents namespace|element
  195. `None` is for the universal selector '*'
  196. """
  197. def __init__(self, namespace=None, element=None):
  198. self.namespace = namespace
  199. self.element = element
  200. def __repr__(self):
  201. element = self.element or '*'
  202. if self.namespace:
  203. element = '%s|%s' % (self.namespace, element)
  204. return '%s[%s]' % (self.__class__.__name__, element)
  205. def specificity(self):
  206. if self.element:
  207. return 0, 0, 1
  208. else:
  209. return 0, 0, 0
  210. class Hash(object):
  211. """
  212. Represents selector#id
  213. """
  214. def __init__(self, selector, id):
  215. self.selector = selector
  216. self.id = id
  217. def __repr__(self):
  218. return '%s[%r#%s]' % (
  219. self.__class__.__name__, self.selector, self.id)
  220. def specificity(self):
  221. a, b, c = self.selector.specificity()
  222. a += 1
  223. return a, b, c
  224. class CombinedSelector(object):
  225. def __init__(self, selector, combinator, subselector):
  226. assert selector is not None
  227. self.selector = selector
  228. self.combinator = combinator
  229. self.subselector = subselector
  230. def __repr__(self):
  231. if self.combinator == ' ':
  232. comb = '<followed>'
  233. else:
  234. comb = self.combinator
  235. return '%s[%r %s %r]' % (
  236. self.__class__.__name__, self.selector, comb, self.subselector)
  237. def specificity(self):
  238. a1, b1, c1 = self.selector.specificity()
  239. a2, b2, c2 = self.subselector.specificity()
  240. return a1 + a2, b1 + b2, c1 + c2
  241. #### Parser
# Patterns for the fast path in parse(). Each one must match the whole
# input, allowing only leading and trailing whitespace around the selector.

# ``foo``: a bare element name made of ASCII letters only.
_el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$')
# ``foo#bar`` or ``#bar``: group 1 is the (possibly empty) element name,
# group 2 the ID.
_id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$')
# ``foo.bar`` or ``.bar``: group 1 is the (possibly empty) element name,
# group 2 the class name, which must start with an ASCII letter.
_class_re = re.compile(
    r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$')
  249. def parse(css):
  250. """Parse a CSS *group of selectors*.
  251. If you don't care about pseudo-elements or selector specificity,
  252. you can skip this and use :meth:`~GenericTranslator.css_to_xpath`.
  253. :param css:
  254. A *group of selectors* as an Unicode string.
  255. :raises:
  256. :class:`SelectorSyntaxError` on invalid selectors.
  257. :returns:
  258. A list of parsed :class:`Selector` objects, one for each
  259. selector in the comma-separated group.
  260. """
  261. # Fast path for simple cases
  262. match = _el_re.match(css)
  263. if match:
  264. return [Selector(Element(element=match.group(1)))]
  265. match = _id_re.match(css)
  266. if match is not None:
  267. return [Selector(Hash(Element(element=match.group(1) or None),
  268. match.group(2)))]
  269. match = _class_re.match(css)
  270. if match is not None:
  271. return [Selector(Class(Element(element=match.group(1) or None),
  272. match.group(2)))]
  273. stream = TokenStream(tokenize(css))
  274. stream.source = css
  275. return list(parse_selector_group(stream))
  276. # except SelectorSyntaxError:
  277. # e = sys.exc_info()[1]
  278. # message = "%s at %s -> %r" % (
  279. # e, stream.used, stream.peek())
  280. # e.msg = message
  281. # e.args = tuple([message])
  282. # raise
  283. def parse_selector_group(stream):
  284. stream.skip_whitespace()
  285. while 1:
  286. yield Selector(*parse_selector(stream))
  287. if stream.peek() == ('DELIM', ','):
  288. stream.next()
  289. stream.skip_whitespace()
  290. else:
  291. break
  292. def parse_selector(stream):
  293. result, pseudo_element = parse_simple_selector(stream)
  294. while 1:
  295. stream.skip_whitespace()
  296. peek = stream.peek()
  297. if peek in (('EOF', None), ('DELIM', ',')):
  298. break
  299. if pseudo_element:
  300. raise SelectorSyntaxError(
  301. 'Got pseudo-element ::%s not at the end of a selector'
  302. % pseudo_element)
  303. if peek.is_delim('+', '>', '~'):
  304. # A combinator
  305. combinator = stream.next().value
  306. stream.skip_whitespace()
  307. else:
  308. # By exclusion, the last parse_simple_selector() ended
  309. # at peek == ' '
  310. combinator = ' '
  311. next_selector, pseudo_element = parse_simple_selector(stream)
  312. result = CombinedSelector(result, combinator, next_selector)
  313. return result, pseudo_element
def parse_simple_selector(stream, inside_negation=False):
    """Parse a sequence of simple selectors, without any combinator.

    :param stream:
        The token stream to consume from.
    :param inside_negation:
        True while parsing the argument of ``:not()``; a ``)`` then ends
        the sequence and a nested ``:not()`` is rejected.
    :returns:
        A ``(result, pseudo_element)`` tuple: the parsed tree and either
        ``None``, a pseudo-element name, or
        a :class:`FunctionalPseudoElement`.
    :raises:
        :class:`SelectorSyntaxError` on unexpected tokens, when nothing
        was consumed, or when a pseudo-element is not in last position.

    """
    stream.skip_whitespace()
    # Remember how many tokens were consumed so far, to detect an empty
    # selector at the end.
    selector_start = len(stream.used)
    peek = stream.peek()
    # Optional leading type selector: ``element``, ``*``, ``ns|element``
    # or ``*|element``.
    if peek.type == 'IDENT' or peek == ('DELIM', '*'):
        if peek.type == 'IDENT':
            namespace = stream.next().value
        else:
            stream.next()
            namespace = None
        if stream.peek() == ('DELIM', '|'):
            stream.next()
            element = stream.next_ident_or_star()
        else:
            # No '|': what was read first is the element, not a namespace.
            element = namespace
            namespace = None
    else:
        element = namespace = None
    result = Element(namespace, element)
    pseudo_element = None
    # Each iteration wraps ``result`` with one more condition.
    while 1:
        peek = stream.peek()
        # Whitespace, end of input, a comma or a combinator end the sequence.
        if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or (
                inside_negation and peek == ('DELIM', ')')):
            break
        if pseudo_element:
            raise SelectorSyntaxError(
                'Got pseudo-element ::%s not at the end of a selector'
                % pseudo_element)
        if peek.type == 'HASH':
            result = Hash(result, stream.next().value)
        elif peek == ('DELIM', '.'):
            stream.next()
            result = Class(result, stream.next_ident())
        elif peek == ('DELIM', '['):
            stream.next()
            result = parse_attrib(result, stream)
        elif peek == ('DELIM', ':'):
            stream.next()
            if stream.peek() == ('DELIM', ':'):
                # ``::name`` or ``::name(arguments)``: a pseudo-element.
                stream.next()
                pseudo_element = stream.next_ident()
                if stream.peek() == ('DELIM', '('):
                    stream.next()
                    pseudo_element = FunctionalPseudoElement(
                        pseudo_element, parse_arguments(stream))
                continue
            ident = stream.next_ident()
            if ident.lower() in ('first-line', 'first-letter',
                                 'before', 'after'):
                # Special case: CSS 2.1 pseudo-elements can have a single ':'
                # Any new pseudo-element must have two.
                pseudo_element = _unicode(ident)
                continue
            if stream.peek() != ('DELIM', '('):
                # No parenthesis: a plain pseudo-class.
                result = Pseudo(result, ident)
                continue
            # Consume the '(' of a functional pseudo-class.
            stream.next()
            stream.skip_whitespace()
            if ident.lower() == 'not':
                if inside_negation:
                    raise SelectorSyntaxError('Got nested :not()')
                argument, argument_pseudo_element = parse_simple_selector(
                    stream, inside_negation=True)
                # The token after the argument; expected to be ')'.
                next = stream.next()
                if argument_pseudo_element:
                    raise SelectorSyntaxError(
                        'Got pseudo-element ::%s inside :not() at %s'
                        % (argument_pseudo_element, next.pos))
                if next != ('DELIM', ')'):
                    raise SelectorSyntaxError("Expected ')', got %s" % (next,))
                result = Negation(result, argument)
            else:
                result = Function(result, ident, parse_arguments(stream))
        else:
            raise SelectorSyntaxError(
                "Expected selector, got %s" % (peek,))
    # No token consumed since the start: there was no selector here.
    if len(stream.used) == selector_start:
        raise SelectorSyntaxError(
            "Expected selector, got %s" % (stream.peek(),))
    return result, pseudo_element
  395. def parse_arguments(stream):
  396. arguments = []
  397. while 1:
  398. stream.skip_whitespace()
  399. next = stream.next()
  400. if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [
  401. ('DELIM', '+'), ('DELIM', '-')]:
  402. arguments.append(next)
  403. elif next == ('DELIM', ')'):
  404. return arguments
  405. else:
  406. raise SelectorSyntaxError(
  407. "Expected an argument, got %s" % (next,))
  408. def parse_attrib(selector, stream):
  409. stream.skip_whitespace()
  410. attrib = stream.next_ident_or_star()
  411. if attrib is None and stream.peek() != ('DELIM', '|'):
  412. raise SelectorSyntaxError(
  413. "Expected '|', got %s" % (stream.peek(),))
  414. if stream.peek() == ('DELIM', '|'):
  415. stream.next()
  416. if stream.peek() == ('DELIM', '='):
  417. namespace = None
  418. stream.next()
  419. op = '|='
  420. else:
  421. namespace = attrib
  422. attrib = stream.next_ident()
  423. op = None
  424. else:
  425. namespace = op = None
  426. if op is None:
  427. stream.skip_whitespace()
  428. next = stream.next()
  429. if next == ('DELIM', ']'):
  430. return Attrib(selector, namespace, attrib, 'exists', None)
  431. elif next == ('DELIM', '='):
  432. op = '='
  433. elif next.is_delim('^', '$', '*', '~', '|', '!') and (
  434. stream.peek() == ('DELIM', '=')):
  435. op = next.value + '='
  436. stream.next()
  437. else:
  438. raise SelectorSyntaxError(
  439. "Operator expected, got %s" % (next,))
  440. stream.skip_whitespace()
  441. value = stream.next()
  442. if value.type not in ('IDENT', 'STRING'):
  443. raise SelectorSyntaxError(
  444. "Expected string or ident, got %s" % (value,))
  445. stream.skip_whitespace()
  446. next = stream.next()
  447. if next != ('DELIM', ']'):
  448. raise SelectorSyntaxError(
  449. "Expected ']', got %s" % (next,))
  450. return Attrib(selector, namespace, attrib, op, value.value)
  451. def parse_series(tokens):
  452. """
  453. Parses the arguments for :nth-child() and friends.
  454. :raises: A list of tokens
  455. :returns: :``(a, b)``
  456. """
  457. for token in tokens:
  458. if token.type == 'STRING':
  459. raise ValueError('String tokens not allowed in series.')
  460. s = ''.join(token.value for token in tokens).strip()
  461. if s == 'odd':
  462. return 2, 1
  463. elif s == 'even':
  464. return 2, 0
  465. elif s == 'n':
  466. return 1, 0
  467. if 'n' not in s:
  468. # Just b
  469. return 0, int(s)
  470. a, b = s.split('n', 1)
  471. if not a:
  472. a = 1
  473. elif a == '-' or a == '+':
  474. a = int(a+'1')
  475. else:
  476. a = int(a)
  477. if not b:
  478. b = 0
  479. else:
  480. b = int(b)
  481. return a, b
  482. #### Token objects
  483. class Token(tuple):
  484. def __new__(cls, type_, value, pos):
  485. obj = tuple.__new__(cls, (type_, value))
  486. obj.pos = pos
  487. return obj
  488. def __repr__(self):
  489. return "<%s '%s' at %i>" % (self.type, self.value, self.pos)
  490. def is_delim(self, *values):
  491. return self.type == 'DELIM' and self.value in values
  492. type = property(operator.itemgetter(0))
  493. value = property(operator.itemgetter(1))
  494. class EOFToken(Token):
  495. def __new__(cls, pos):
  496. return Token.__new__(cls, 'EOF', None, pos)
  497. def __repr__(self):
  498. return '<%s at %i>' % (self.type, self.pos)
  499. #### Tokenizer
class TokenMacros:
    # Regular-expression fragments. _compile() interpolates them into
    # patterns through ``%(name)s`` placeholders. The character classes are
    # written in lower case only; the patterns are compiled case-insensitively.

    # A backslash, one to six hexadecimal digits, then optionally a single
    # whitespace character (CR LF counts as one).
    unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?'
    # A unicode escape, or a backslash followed by any character that is
    # neither a newline nor a hexadecimal digit.
    escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]'
    # Inside strings, a backslash followed by a newline is also an escape.
    string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape
    # Any character above octal 177 (decimal 127).
    nonascii = r'[^\0-\177]'
    # One character allowed inside a name.
    nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii)
    # One character allowed at the start of a name (no digit, no dash).
    nmstart = '[_a-z]|%s|%s' % (escape, nonascii)
  507. def _compile(pattern):
  508. return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match
# Matchers used by tokenize(). Each is the bound ``match`` method of a
# compiled pattern and is called as ``matcher(string, pos=...)``.
_match_whitespace = _compile(r'[ \t\r\n\f]+')
# An optionally signed integer or decimal number.
_match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)')
# A '#' followed by at least one name character.
_match_hash = _compile('#(?:%(nmchar)s)+')
# An identifier: optional leading '-', a name-start character, then any
# number of name characters.
_match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*')
# The content of a string, keyed by its quote character; these patterns do
# not include the quotes themselves and can match the empty string.
_match_string_by_quote = {
    "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"),
    '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'),
}

# Substitution helpers used to decode escapes; each is a bound ``sub`` method.
_sub_simple_escape = re.compile(r'\\(.)').sub
_sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub
_sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub

# Same as r'\1', but faster on CPython
_replace_simple = operator.methodcaller('group', 1)
  522. def _replace_unicode(match):
  523. codepoint = int(match.group(1), 16)
  524. if codepoint > sys.maxunicode:
  525. codepoint = 0xFFFD
  526. return _unichr(codepoint)
  527. def unescape_ident(value):
  528. value = _sub_unicode_escape(_replace_unicode, value)
  529. value = _sub_simple_escape(_replace_simple, value)
  530. return value
  531. def tokenize(s):
  532. pos = 0
  533. len_s = len(s)
  534. while pos < len_s:
  535. match = _match_whitespace(s, pos=pos)
  536. if match:
  537. yield Token('S', ' ', pos)
  538. pos = match.end()
  539. continue
  540. match = _match_ident(s, pos=pos)
  541. if match:
  542. value = _sub_simple_escape(_replace_simple,
  543. _sub_unicode_escape(_replace_unicode, match.group()))
  544. yield Token('IDENT', value, pos)
  545. pos = match.end()
  546. continue
  547. match = _match_hash(s, pos=pos)
  548. if match:
  549. value = _sub_simple_escape(_replace_simple,
  550. _sub_unicode_escape(_replace_unicode, match.group()[1:]))
  551. yield Token('HASH', value, pos)
  552. pos = match.end()
  553. continue
  554. quote = s[pos]
  555. if quote in _match_string_by_quote:
  556. match = _match_string_by_quote[quote](s, pos=pos + 1)
  557. assert match, 'Should have found at least an empty match'
  558. end_pos = match.end()
  559. if end_pos == len_s:
  560. raise SelectorSyntaxError('Unclosed string at %s' % pos)
  561. if s[end_pos] != quote:
  562. raise SelectorSyntaxError('Invalid string at %s' % pos)
  563. value = _sub_simple_escape(_replace_simple,
  564. _sub_unicode_escape(_replace_unicode,
  565. _sub_newline_escape('', match.group())))
  566. yield Token('STRING', value, pos)
  567. pos = end_pos + 1
  568. continue
  569. match = _match_number(s, pos=pos)
  570. if match:
  571. value = match.group()
  572. yield Token('NUMBER', value, pos)
  573. pos = match.end()
  574. continue
  575. pos2 = pos + 2
  576. if s[pos:pos2] == '/*':
  577. pos = s.find('*/', pos2)
  578. if pos == -1:
  579. pos = len_s
  580. else:
  581. pos += 2
  582. continue
  583. yield Token('DELIM', s[pos], pos)
  584. pos += 1
  585. assert pos == len_s
  586. yield EOFToken(pos)
  587. class TokenStream(object):
  588. def __init__(self, tokens, source=None):
  589. self.used = []
  590. self.tokens = iter(tokens)
  591. self.source = source
  592. self.peeked = None
  593. self._peeking = False
  594. try:
  595. self.next_token = self.tokens.next
  596. except AttributeError:
  597. # Python 3
  598. self.next_token = self.tokens.__next__
  599. def next(self):
  600. if self._peeking:
  601. self._peeking = False
  602. self.used.append(self.peeked)
  603. return self.peeked
  604. else:
  605. next = self.next_token()
  606. self.used.append(next)
  607. return next
  608. def peek(self):
  609. if not self._peeking:
  610. self.peeked = self.next_token()
  611. self._peeking = True
  612. return self.peeked
  613. def next_ident(self):
  614. next = self.next()
  615. if next.type != 'IDENT':
  616. raise SelectorSyntaxError('Expected ident, got %s' % (next,))
  617. return next.value
  618. def next_ident_or_star(self):
  619. next = self.next()
  620. if next.type == 'IDENT':
  621. return next.value
  622. elif next == ('DELIM', '*'):
  623. return None
  624. else:
  625. raise SelectorSyntaxError(
  626. "Expected ident or '*', got %s" % (next,))
  627. def skip_whitespace(self):
  628. peek = self.peek()
  629. if peek.type == 'S':
  630. self.next()