# -*- coding: utf-8 -*- """ cssselect.parser ================ Tokenizer, parser and parsed objects for CSS selectors. :copyright: (c) 2007-2012 Ian Bicking and contributors. See AUTHORS for more details. :license: BSD, see LICENSE for more details. """ import sys import re import operator if sys.version_info[0] < 3: _unicode = unicode _unichr = unichr else: _unicode = str _unichr = chr def ascii_lower(string): """Lower-case, but only in the ASCII range.""" return string.encode('utf8').lower().decode('utf8') class SelectorError(Exception): """Common parent for :class:`SelectorSyntaxError` and :class:`ExpressionError`. You can just use ``except SelectorError:`` when calling :meth:`~GenericTranslator.css_to_xpath` and handle both exceptions types. """ class SelectorSyntaxError(SelectorError, SyntaxError): """Parsing a selector that does not match the grammar.""" #### Parsed objects class Selector(object): """ Represents a parsed selector. :meth:`~GenericTranslator.selector_to_xpath` accepts this object, but ignores :attr:`pseudo_element`. It is the user’s responsibility to account for pseudo-elements and reject selectors with unknown or unsupported pseudo-elements. """ def __init__(self, tree, pseudo_element=None): self.parsed_tree = tree if pseudo_element is not None and not isinstance( pseudo_element, FunctionalPseudoElement): pseudo_element = ascii_lower(pseudo_element) #: A :class:`FunctionalPseudoElement`, #: or the identifier for the pseudo-element as a string, # or ``None``. #: #: +-------------------------+----------------+--------------------------------+ #: | | Selector | Pseudo-element | #: +=========================+================+================================+ #: | CSS3 syntax | ``a::before`` | ``'before'`` | #: +-------------------------+----------------+--------------------------------+ #: | Older syntax | ``a:before`` | ``'before'`` | #: +-------------------------+----------------+--------------------------------+ #: | From the Lists3_ draft, | ``li::marker`` | ``'marker'`` | #: | not in Selectors3 | | | #: +-------------------------+----------------+--------------------------------+ #: | Invalid pseudo-class | ``li:marker`` | ``None`` | #: +-------------------------+----------------+--------------------------------+ #: | Functinal | ``a::foo(2)`` | ``FunctionalPseudoElement(…)`` | #: +-------------------------+----------------+--------------------------------+ #: #: .. _Lists3: http://www.w3.org/TR/2011/WD-css3-lists-20110524/#marker-pseudoelement self.pseudo_element = pseudo_element def __repr__(self): if isinstance(self.pseudo_element, FunctionalPseudoElement): pseudo_element = repr(self.pseudo_element) elif self.pseudo_element: pseudo_element = '::%s' % self.pseudo_element else: pseudo_element = '' return '%s[%r%s]' % ( self.__class__.__name__, self.parsed_tree, pseudo_element) def specificity(self): """Return the specificity_ of this selector as a tuple of 3 integers. .. _specificity: http://www.w3.org/TR/selectors/#specificity """ a, b, c = self.parsed_tree.specificity() if self.pseudo_element: c += 1 return a, b, c class Class(object): """ Represents selector.class_name """ def __init__(self, selector, class_name): self.selector = selector self.class_name = class_name def __repr__(self): return '%s[%r.%s]' % ( self.__class__.__name__, self.selector, self.class_name) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class FunctionalPseudoElement(object): """ Represents selector::name(arguments) .. attribute:: name The name (identifier) of the pseudo-element, as a string. .. attribute:: arguments The arguments of the pseudo-element, as a list of tokens. **Note:** tokens are not part of the public API, and may change between cssselect versions. Use at your own risks. """ def __init__(self, name, arguments): self.name = ascii_lower(name) self.arguments = arguments def __repr__(self): return '%s[::%s(%r)]' % ( self.__class__.__name__, self.name, [token.value for token in self.arguments]) def argument_types(self): return [token.type for token in self.arguments] def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Function(object): """ Represents selector:name(expr) """ def __init__(self, selector, name, arguments): self.selector = selector self.name = ascii_lower(name) self.arguments = arguments def __repr__(self): return '%s[%r:%s(%r)]' % ( self.__class__.__name__, self.selector, self.name, [token.value for token in self.arguments]) def argument_types(self): return [token.type for token in self.arguments] def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Pseudo(object): """ Represents selector:ident """ def __init__(self, selector, ident): self.selector = selector self.ident = ascii_lower(ident) def __repr__(self): return '%s[%r:%s]' % ( self.__class__.__name__, self.selector, self.ident) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Negation(object): """ Represents selector:not(subselector) """ def __init__(self, selector, subselector): self.selector = selector self.subselector = subselector def __repr__(self): return '%s[%r:not(%r)]' % ( self.__class__.__name__, self.selector, self.subselector) def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 class Attrib(object): """ Represents selector[namespace|attrib operator value] """ def __init__(self, selector, namespace, attrib, operator, value): self.selector = selector self.namespace = namespace self.attrib = attrib self.operator = operator self.value = value def __repr__(self): if self.namespace: attrib = '%s|%s' % (self.namespace, self.attrib) else: attrib = self.attrib if self.operator == 'exists': return '%s[%r[%s]]' % ( self.__class__.__name__, self.selector, attrib) else: return '%s[%r[%s %s %r]]' % ( self.__class__.__name__, self.selector, attrib, self.operator, self.value) def specificity(self): a, b, c = self.selector.specificity() b += 1 return a, b, c class Element(object): """ Represents namespace|element `None` is for the universal selector '*' """ def __init__(self, namespace=None, element=None): self.namespace = namespace self.element = element def __repr__(self): element = self.element or '*' if self.namespace: element = '%s|%s' % (self.namespace, element) return '%s[%s]' % (self.__class__.__name__, element) def specificity(self): if self.element: return 0, 0, 1 else: return 0, 0, 0 class Hash(object): """ Represents selector#id """ def __init__(self, selector, id): self.selector = selector self.id = id def __repr__(self): return '%s[%r#%s]' % ( self.__class__.__name__, self.selector, self.id) def specificity(self): a, b, c = self.selector.specificity() a += 1 return a, b, c class CombinedSelector(object): def __init__(self, selector, combinator, subselector): assert selector is not None self.selector = selector self.combinator = combinator self.subselector = subselector def __repr__(self): if self.combinator == ' ': comb = '' else: comb = self.combinator return '%s[%r %s %r]' % ( self.__class__.__name__, self.selector, comb, self.subselector) def specificity(self): a1, b1, c1 = self.selector.specificity() a2, b2, c2 = self.subselector.specificity() return a1 + a2, b1 + b2, c1 + c2 #### Parser # foo _el_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]+)[ \t\r\n\f]*$') # foo#bar or #bar _id_re = re.compile(r'^[ \t\r\n\f]*([a-zA-Z]*)#([a-zA-Z0-9_-]+)[ \t\r\n\f]*$') # foo.bar or .bar _class_re = re.compile( r'^[ \t\r\n\f]*([a-zA-Z]*)\.([a-zA-Z][a-zA-Z0-9_-]*)[ \t\r\n\f]*$') def parse(css): """Parse a CSS *group of selectors*. If you don't care about pseudo-elements or selector specificity, you can skip this and use :meth:`~GenericTranslator.css_to_xpath`. :param css: A *group of selectors* as an Unicode string. :raises: :class:`SelectorSyntaxError` on invalid selectors. :returns: A list of parsed :class:`Selector` objects, one for each selector in the comma-separated group. """ # Fast path for simple cases match = _el_re.match(css) if match: return [Selector(Element(element=match.group(1)))] match = _id_re.match(css) if match is not None: return [Selector(Hash(Element(element=match.group(1) or None), match.group(2)))] match = _class_re.match(css) if match is not None: return [Selector(Class(Element(element=match.group(1) or None), match.group(2)))] stream = TokenStream(tokenize(css)) stream.source = css return list(parse_selector_group(stream)) # except SelectorSyntaxError: # e = sys.exc_info()[1] # message = "%s at %s -> %r" % ( # e, stream.used, stream.peek()) # e.msg = message # e.args = tuple([message]) # raise def parse_selector_group(stream): stream.skip_whitespace() while 1: yield Selector(*parse_selector(stream)) if stream.peek() == ('DELIM', ','): stream.next() stream.skip_whitespace() else: break def parse_selector(stream): result, pseudo_element = parse_simple_selector(stream) while 1: stream.skip_whitespace() peek = stream.peek() if peek in (('EOF', None), ('DELIM', ',')): break if pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s not at the end of a selector' % pseudo_element) if peek.is_delim('+', '>', '~'): # A combinator combinator = stream.next().value stream.skip_whitespace() else: # By exclusion, the last parse_simple_selector() ended # at peek == ' ' combinator = ' ' next_selector, pseudo_element = parse_simple_selector(stream) result = CombinedSelector(result, combinator, next_selector) return result, pseudo_element def parse_simple_selector(stream, inside_negation=False): stream.skip_whitespace() selector_start = len(stream.used) peek = stream.peek() if peek.type == 'IDENT' or peek == ('DELIM', '*'): if peek.type == 'IDENT': namespace = stream.next().value else: stream.next() namespace = None if stream.peek() == ('DELIM', '|'): stream.next() element = stream.next_ident_or_star() else: element = namespace namespace = None else: element = namespace = None result = Element(namespace, element) pseudo_element = None while 1: peek = stream.peek() if peek.type in ('S', 'EOF') or peek.is_delim(',', '+', '>', '~') or ( inside_negation and peek == ('DELIM', ')')): break if pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s not at the end of a selector' % pseudo_element) if peek.type == 'HASH': result = Hash(result, stream.next().value) elif peek == ('DELIM', '.'): stream.next() result = Class(result, stream.next_ident()) elif peek == ('DELIM', '['): stream.next() result = parse_attrib(result, stream) elif peek == ('DELIM', ':'): stream.next() if stream.peek() == ('DELIM', ':'): stream.next() pseudo_element = stream.next_ident() if stream.peek() == ('DELIM', '('): stream.next() pseudo_element = FunctionalPseudoElement( pseudo_element, parse_arguments(stream)) continue ident = stream.next_ident() if ident.lower() in ('first-line', 'first-letter', 'before', 'after'): # Special case: CSS 2.1 pseudo-elements can have a single ':' # Any new pseudo-element must have two. pseudo_element = _unicode(ident) continue if stream.peek() != ('DELIM', '('): result = Pseudo(result, ident) continue stream.next() stream.skip_whitespace() if ident.lower() == 'not': if inside_negation: raise SelectorSyntaxError('Got nested :not()') argument, argument_pseudo_element = parse_simple_selector( stream, inside_negation=True) next = stream.next() if argument_pseudo_element: raise SelectorSyntaxError( 'Got pseudo-element ::%s inside :not() at %s' % (argument_pseudo_element, next.pos)) if next != ('DELIM', ')'): raise SelectorSyntaxError("Expected ')', got %s" % (next,)) result = Negation(result, argument) else: result = Function(result, ident, parse_arguments(stream)) else: raise SelectorSyntaxError( "Expected selector, got %s" % (peek,)) if len(stream.used) == selector_start: raise SelectorSyntaxError( "Expected selector, got %s" % (stream.peek(),)) return result, pseudo_element def parse_arguments(stream): arguments = [] while 1: stream.skip_whitespace() next = stream.next() if next.type in ('IDENT', 'STRING', 'NUMBER') or next in [ ('DELIM', '+'), ('DELIM', '-')]: arguments.append(next) elif next == ('DELIM', ')'): return arguments else: raise SelectorSyntaxError( "Expected an argument, got %s" % (next,)) def parse_attrib(selector, stream): stream.skip_whitespace() attrib = stream.next_ident_or_star() if attrib is None and stream.peek() != ('DELIM', '|'): raise SelectorSyntaxError( "Expected '|', got %s" % (stream.peek(),)) if stream.peek() == ('DELIM', '|'): stream.next() if stream.peek() == ('DELIM', '='): namespace = None stream.next() op = '|=' else: namespace = attrib attrib = stream.next_ident() op = None else: namespace = op = None if op is None: stream.skip_whitespace() next = stream.next() if next == ('DELIM', ']'): return Attrib(selector, namespace, attrib, 'exists', None) elif next == ('DELIM', '='): op = '=' elif next.is_delim('^', '$', '*', '~', '|', '!') and ( stream.peek() == ('DELIM', '=')): op = next.value + '=' stream.next() else: raise SelectorSyntaxError( "Operator expected, got %s" % (next,)) stream.skip_whitespace() value = stream.next() if value.type not in ('IDENT', 'STRING'): raise SelectorSyntaxError( "Expected string or ident, got %s" % (value,)) stream.skip_whitespace() next = stream.next() if next != ('DELIM', ']'): raise SelectorSyntaxError( "Expected ']', got %s" % (next,)) return Attrib(selector, namespace, attrib, op, value.value) def parse_series(tokens): """ Parses the arguments for :nth-child() and friends. :raises: A list of tokens :returns: :``(a, b)`` """ for token in tokens: if token.type == 'STRING': raise ValueError('String tokens not allowed in series.') s = ''.join(token.value for token in tokens).strip() if s == 'odd': return 2, 1 elif s == 'even': return 2, 0 elif s == 'n': return 1, 0 if 'n' not in s: # Just b return 0, int(s) a, b = s.split('n', 1) if not a: a = 1 elif a == '-' or a == '+': a = int(a+'1') else: a = int(a) if not b: b = 0 else: b = int(b) return a, b #### Token objects class Token(tuple): def __new__(cls, type_, value, pos): obj = tuple.__new__(cls, (type_, value)) obj.pos = pos return obj def __repr__(self): return "<%s '%s' at %i>" % (self.type, self.value, self.pos) def is_delim(self, *values): return self.type == 'DELIM' and self.value in values type = property(operator.itemgetter(0)) value = property(operator.itemgetter(1)) class EOFToken(Token): def __new__(cls, pos): return Token.__new__(cls, 'EOF', None, pos) def __repr__(self): return '<%s at %i>' % (self.type, self.pos) #### Tokenizer class TokenMacros: unicode_escape = r'\\([0-9a-f]{1,6})(?:\r\n|[ \n\r\t\f])?' escape = unicode_escape + r'|\\[^\n\r\f0-9a-f]' string_escape = r'\\(?:\n|\r\n|\r|\f)|' + escape nonascii = r'[^\0-\177]' nmchar = '[_a-z0-9-]|%s|%s' % (escape, nonascii) nmstart = '[_a-z]|%s|%s' % (escape, nonascii) def _compile(pattern): return re.compile(pattern % vars(TokenMacros), re.IGNORECASE).match _match_whitespace = _compile(r'[ \t\r\n\f]+') _match_number = _compile(r'[+-]?(?:[0-9]*\.[0-9]+|[0-9]+)') _match_hash = _compile('#(?:%(nmchar)s)+') _match_ident = _compile('-?(?:%(nmstart)s)(?:%(nmchar)s)*') _match_string_by_quote = { "'": _compile(r"([^\n\r\f\\']|%(string_escape)s)*"), '"': _compile(r'([^\n\r\f\\"]|%(string_escape)s)*'), } _sub_simple_escape = re.compile(r'\\(.)').sub _sub_unicode_escape = re.compile(TokenMacros.unicode_escape, re.I).sub _sub_newline_escape =re.compile(r'\\(?:\n|\r\n|\r|\f)').sub # Same as r'\1', but faster on CPython _replace_simple = operator.methodcaller('group', 1) def _replace_unicode(match): codepoint = int(match.group(1), 16) if codepoint > sys.maxunicode: codepoint = 0xFFFD return _unichr(codepoint) def unescape_ident(value): value = _sub_unicode_escape(_replace_unicode, value) value = _sub_simple_escape(_replace_simple, value) return value def tokenize(s): pos = 0 len_s = len(s) while pos < len_s: match = _match_whitespace(s, pos=pos) if match: yield Token('S', ' ', pos) pos = match.end() continue match = _match_ident(s, pos=pos) if match: value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, match.group())) yield Token('IDENT', value, pos) pos = match.end() continue match = _match_hash(s, pos=pos) if match: value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, match.group()[1:])) yield Token('HASH', value, pos) pos = match.end() continue quote = s[pos] if quote in _match_string_by_quote: match = _match_string_by_quote[quote](s, pos=pos + 1) assert match, 'Should have found at least an empty match' end_pos = match.end() if end_pos == len_s: raise SelectorSyntaxError('Unclosed string at %s' % pos) if s[end_pos] != quote: raise SelectorSyntaxError('Invalid string at %s' % pos) value = _sub_simple_escape(_replace_simple, _sub_unicode_escape(_replace_unicode, _sub_newline_escape('', match.group()))) yield Token('STRING', value, pos) pos = end_pos + 1 continue match = _match_number(s, pos=pos) if match: value = match.group() yield Token('NUMBER', value, pos) pos = match.end() continue pos2 = pos + 2 if s[pos:pos2] == '/*': pos = s.find('*/', pos2) if pos == -1: pos = len_s else: pos += 2 continue yield Token('DELIM', s[pos], pos) pos += 1 assert pos == len_s yield EOFToken(pos) class TokenStream(object): def __init__(self, tokens, source=None): self.used = [] self.tokens = iter(tokens) self.source = source self.peeked = None self._peeking = False try: self.next_token = self.tokens.next except AttributeError: # Python 3 self.next_token = self.tokens.__next__ def next(self): if self._peeking: self._peeking = False self.used.append(self.peeked) return self.peeked else: next = self.next_token() self.used.append(next) return next def peek(self): if not self._peeking: self.peeked = self.next_token() self._peeking = True return self.peeked def next_ident(self): next = self.next() if next.type != 'IDENT': raise SelectorSyntaxError('Expected ident, got %s' % (next,)) return next.value def next_ident_or_star(self): next = self.next() if next.type == 'IDENT': return next.value elif next == ('DELIM', '*'): return None else: raise SelectorSyntaxError( "Expected ident or '*', got %s" % (next,)) def skip_whitespace(self): peek = self.peek() if peek.type == 'S': self.next()