alpcentaur
/
basabuuka_prototyp

from __future__ import absolute_import, division, unicode_literalsfrom six import text_type
import re
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharactersfrom .constants import rcdataElements, entities, xmlEntitiesfrom . import treewalkers, _utilsfrom xml.sax.saxutils import escape
# Characters whose presence in an attribute value triggers quoting when
# ``quote_attr_values == "spec"``: the space characters plus the quote,
# apostrophe, equals sign, angle brackets and backtick.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[%s]" % _quoteAttributeSpecChars)

# Wider character class consulted when ``quote_attr_values == "legacy"``:
# everything in the "spec" set plus the C0 control characters, space, slash,
# backtick, no-break space and a range of Unicode space/separator characters.
_quoteAttributeLegacy = re.compile("".join([
    "[",
    _quoteAttributeSpecChars,
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n",
    "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15",
    "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
    "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000",
    "\u2001\u2002\u2003\u2004\u2005\u2006\u2007",
    "\u2008\u2009\u200a\u2028\u2029\u202f\u205f",
    "\u3000]",
]))

# Reverse lookup consulted by the "htmlentityreplace" error handler:
# codepoint (int) -> entity name as it appears in ``entities``.
_encode_entity_map = {}

# True when a non-BMP character occupies a single code unit in a string;
# otherwise such a character is stored as a two-unit surrogate pair.
_is_ucs4 = len("\U0010FFFF") == 1

for entity_name, entity_chars in entities.items():
    # skip multi-character entities (a lone surrogate pair still counts as
    # a single character on builds where _is_ucs4 is False)
    if len(entity_chars) > (1 if _is_ucs4 else 2):
        continue

    # "&" is intentionally never entered into the map
    if entity_chars == "&":
        continue

    if len(entity_chars) == 2:
        codepoint = _utils.surrogatePairToCodepoint(entity_chars)
    else:
        codepoint = ord(entity_chars)

    # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.:
    # an all-lowercase name always replaces whatever was stored before.
    if entity_name.islower() or codepoint not in _encode_entity_map:
        _encode_entity_map[codepoint] = entity_name

def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML character references.

    For encode/translate errors every unencodable character in
    ``exc.object[exc.start:exc.end]`` is replaced by a named reference when
    ``_encode_entity_map`` has one for its codepoint, and by a hexadecimal
    numeric reference (``&#x...;``) otherwise. Any other exception type is
    handed to ``xmlcharrefreplace_errors``.

    :arg exc: the Unicode error raised by the codec

    :returns: ``(replacement_text, position_to_resume_at)``
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    text = exc.object
    failed = text[exc.start:exc.end]

    # First pass: turn the failed span into codepoints, folding each
    # surrogate pair into the single codepoint it stands for.
    codepoints = []
    offset = 0
    while offset < len(failed):
        index = exc.start + offset
        if _utils.isSurrogatePair(text[index:min([exc.end, index + 2])]):
            codepoints.append(
                _utils.surrogatePairToCodepoint(text[index:index + 2]))
            offset += 2  # both halves of the pair are consumed
        else:
            codepoints.append(ord(failed[offset]))
            offset += 1

    # Second pass: render each codepoint as a character reference.
    pieces = []
    for cp in codepoints:
        entity = _encode_entity_map.get(cp)
        if entity:
            # names stored without a trailing semicolon get one appended
            terminator = "" if entity.endswith(";") else ";"
            pieces.append("&" + entity + terminator)
        else:
            pieces.append("&#x%s;" % (hex(cp)[2:]))
    return ("".join(pieces), exc.end)


# Make the handler available as ``errors="htmlentityreplace"``.
register_error("htmlentityreplace", htmlentityreplace_errors)

def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serialize a parsed tree in one call.

    Looks up the treewalker for ``tree``, builds an
    :py:class:`html5lib.serializer.HTMLSerializer` from ``serializer_opts``
    and renders the walked input with it.

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker_class = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = walker_class(input)
    return serializer.render(token_stream, encoding)

class HTMLSerializer(object):
    """Serialize a treewalker token stream as HTML.

    The class attributes below are the option defaults; keyword arguments
    given to the constructor override them on the instance. Problems found
    while serializing are collected in ``self.errors`` and, when
    ``self.strict`` is true, also raised as ``SerializeError``.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not one of ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            # Report the alphabetically first offender so the message does
            # not depend on set iteration order.
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % sorted(unexpected_args)[0])
        # An explicitly chosen quote character switches off automatic
        # selection unless use_best_quote_char is passed as well.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` with ``self.encoding``, replacing unencodable
        characters through the ``"htmlentityreplace"`` error handler.

        The text is returned unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        return string

    def encodeStrict(self, string):
        """Encode ``string`` with ``self.encoding`` using ``"strict"`` error
        handling, i.e. without any character-reference fallback.

        The text is returned unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized output for ``treewalker`` piece by piece.

        :arg treewalker: iterable of token dicts to serialize

        :arg encoding: when truthy, every yielded piece is encoded to bytes
            with this encoding; otherwise text is yielded

        :returns: a generator of output fragments

        :raises ValueError: if ``quote_attr_values`` is not ``"always"``,
            ``"spec"`` or ``"legacy"``

        ``self.errors`` is reset at the start of every run.
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Quote the system identifier with whichever quote
                    # character it does not itself contain.
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifer contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A boolean attribute is written as the bare name when
                    # minimisation is on; everything else gets "=value".
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple()) and
                         k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values == "always" or len(v) == 0:
                            quote_attr = True
                        elif self.quote_attr_values == "spec":
                            quote_attr = _quoteAttributeSpec.search(v) is not None
                        elif self.quote_attr_values == "legacy":
                            quote_attr = _quoteAttributeLegacy.search(v) is not None
                        else:
                            raise ValueError("quote_attr_values must be one of: "
                                             "'always', 'spec', or 'legacy'")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Pick the quote that needs no escaping when
                                # the value contains only one kind.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                recognized = key in entities
                if not recognized:
                    self.serializeError("Entity %s not recognized" % name)
                # Only a recognized entity can be resolved; an unknown one is
                # written back as "&name;" instead of failing the lookup with
                # a KeyError when strict mode is off.
                if recognized and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Unknown token type: record its payload as the error.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree; bytes when ``encoding`` is truthy,
            text otherwise

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem.

        :arg data: description of the problem; appended to ``self.errors``

        :raises SerializeError: carrying ``data``, when ``self.strict`` is
            true
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)

class SerializeError(Exception):
    """Raised by ``HTMLSerializer.serializeError`` when strict mode is on."""