You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

409 lines
15 KiB

4 years ago
  1. from __future__ import absolute_import, division, unicode_literals
  2. from six import text_type
  3. import re
  4. from codecs import register_error, xmlcharrefreplace_errors
  5. from .constants import voidElements, booleanAttributes, spaceCharacters
  6. from .constants import rcdataElements, entities, xmlEntities
  7. from . import treewalkers, _utils
  8. from xml.sax.saxutils import escape
  9. _quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
  10. _quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
  11. _quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
  12. "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
  13. "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
  14. "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
  15. "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
  16. "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
  17. "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
  18. "\u3000]")
  19. _encode_entity_map = {}
  20. _is_ucs4 = len("\U0010FFFF") == 1
  21. for k, v in list(entities.items()):
  22. # skip multi-character entities
  23. if ((_is_ucs4 and len(v) > 1) or
  24. (not _is_ucs4 and len(v) > 2)):
  25. continue
  26. if v != "&":
  27. if len(v) == 2:
  28. v = _utils.surrogatePairToCodepoint(v)
  29. else:
  30. v = ord(v)
  31. if v not in _encode_entity_map or k.islower():
  32. # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
  33. _encode_entity_map[v] = k
  34. def htmlentityreplace_errors(exc):
  35. if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
  36. res = []
  37. codepoints = []
  38. skip = False
  39. for i, c in enumerate(exc.object[exc.start:exc.end]):
  40. if skip:
  41. skip = False
  42. continue
  43. index = i + exc.start
  44. if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
  45. codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
  46. skip = True
  47. else:
  48. codepoint = ord(c)
  49. codepoints.append(codepoint)
  50. for cp in codepoints:
  51. e = _encode_entity_map.get(cp)
  52. if e:
  53. res.append("&")
  54. res.append(e)
  55. if not e.endswith(";"):
  56. res.append(";")
  57. else:
  58. res.append("&#x%s;" % (hex(cp)[2:]))
  59. return ("".join(res), exc.end)
  60. else:
  61. return xmlcharrefreplace_errors(exc)
  62. register_error("htmlentityreplace", htmlentityreplace_errors)
  63. def serialize(input, tree="etree", encoding=None, **serializer_opts):
  64. """Serializes the input token stream using the specified treewalker
  65. :arg input: the token stream to serialize
  66. :arg tree: the treewalker to use
  67. :arg encoding: the encoding to use
  68. :arg serializer_opts: any options to pass to the
  69. :py:class:`html5lib.serializer.HTMLSerializer` that gets created
  70. :returns: the tree serialized as a string
  71. Example:
  72. >>> from html5lib.html5parser import parse
  73. >>> from html5lib.serializer import serialize
  74. >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
  75. >>> serialize(token_stream, omit_optional_tags=False)
  76. '<html><head></head><body><p>Hi!</p></body></html>'
  77. """
  78. # XXX: Should we cache this?
  79. walker = treewalkers.getTreeWalker(tree)
  80. s = HTMLSerializer(**serializer_opts)
  81. return s.render(walker(input), encoding)
  82. class HTMLSerializer(object):
  83. # attribute quoting options
  84. quote_attr_values = "legacy" # be secure by default
  85. quote_char = '"'
  86. use_best_quote_char = True
  87. # tag syntax options
  88. omit_optional_tags = True
  89. minimize_boolean_attributes = True
  90. use_trailing_solidus = False
  91. space_before_trailing_solidus = True
  92. # escaping options
  93. escape_lt_in_attrs = False
  94. escape_rcdata = False
  95. resolve_entities = True
  96. # miscellaneous options
  97. alphabetical_attributes = False
  98. inject_meta_charset = True
  99. strip_whitespace = False
  100. sanitize = False
  101. options = ("quote_attr_values", "quote_char", "use_best_quote_char",
  102. "omit_optional_tags", "minimize_boolean_attributes",
  103. "use_trailing_solidus", "space_before_trailing_solidus",
  104. "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
  105. "alphabetical_attributes", "inject_meta_charset",
  106. "strip_whitespace", "sanitize")
  107. def __init__(self, **kwargs):
  108. """Initialize HTMLSerializer
  109. :arg inject_meta_charset: Whether or not to inject the meta charset.
  110. Defaults to ``True``.
  111. :arg quote_attr_values: Whether to quote attribute values that don't
  112. require quoting per legacy browser behavior (``"legacy"``), when
  113. required by the standard (``"spec"``), or always (``"always"``).
  114. Defaults to ``"legacy"``.
  115. :arg quote_char: Use given quote character for attribute quoting.
  116. Defaults to ``"`` which will use double quotes unless attribute
  117. value contains a double quote, in which case single quotes are
  118. used.
  119. :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
  120. values.
  121. Defaults to ``False``.
  122. :arg escape_rcdata: Whether to escape characters that need to be
  123. escaped within normal elements within rcdata elements such as
  124. style.
  125. Defaults to ``False``.
  126. :arg resolve_entities: Whether to resolve named character entities that
  127. appear in the source tree. The XML predefined entities &lt; &gt;
  128. &amp; &quot; &apos; are unaffected by this setting.
  129. Defaults to ``True``.
  130. :arg strip_whitespace: Whether to remove semantically meaningless
  131. whitespace. (This compresses all whitespace to a single space
  132. except within ``pre``.)
  133. Defaults to ``False``.
  134. :arg minimize_boolean_attributes: Shortens boolean attributes to give
  135. just the attribute value, for example::
  136. <input disabled="disabled">
  137. becomes::
  138. <input disabled>
  139. Defaults to ``True``.
  140. :arg use_trailing_solidus: Includes a close-tag slash at the end of the
  141. start tag of void elements (empty elements whose end tag is
  142. forbidden). E.g. ``<hr/>``.
  143. Defaults to ``False``.
  144. :arg space_before_trailing_solidus: Places a space immediately before
  145. the closing slash in a tag using a trailing solidus. E.g.
  146. ``<hr />``. Requires ``use_trailing_solidus=True``.
  147. Defaults to ``True``.
  148. :arg sanitize: Strip all unsafe or unknown constructs from output.
  149. See :py:class:`html5lib.filters.sanitizer.Filter`.
  150. Defaults to ``False``.
  151. :arg omit_optional_tags: Omit start/end tags that are optional.
  152. Defaults to ``True``.
  153. :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.
  154. Defaults to ``False``.
  155. """
  156. unexpected_args = frozenset(kwargs) - frozenset(self.options)
  157. if len(unexpected_args) > 0:
  158. raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
  159. if 'quote_char' in kwargs:
  160. self.use_best_quote_char = False
  161. for attr in self.options:
  162. setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
  163. self.errors = []
  164. self.strict = False
  165. def encode(self, string):
  166. assert(isinstance(string, text_type))
  167. if self.encoding:
  168. return string.encode(self.encoding, "htmlentityreplace")
  169. else:
  170. return string
  171. def encodeStrict(self, string):
  172. assert(isinstance(string, text_type))
  173. if self.encoding:
  174. return string.encode(self.encoding, "strict")
  175. else:
  176. return string
  177. def serialize(self, treewalker, encoding=None):
  178. # pylint:disable=too-many-nested-blocks
  179. self.encoding = encoding
  180. in_cdata = False
  181. self.errors = []
  182. if encoding and self.inject_meta_charset:
  183. from .filters.inject_meta_charset import Filter
  184. treewalker = Filter(treewalker, encoding)
  185. # Alphabetical attributes is here under the assumption that none of
  186. # the later filters add or change order of attributes; it needs to be
  187. # before the sanitizer so escaped elements come out correctly
  188. if self.alphabetical_attributes:
  189. from .filters.alphabeticalattributes import Filter
  190. treewalker = Filter(treewalker)
  191. # WhitespaceFilter should be used before OptionalTagFilter
  192. # for maximum efficiently of this latter filter
  193. if self.strip_whitespace:
  194. from .filters.whitespace import Filter
  195. treewalker = Filter(treewalker)
  196. if self.sanitize:
  197. from .filters.sanitizer import Filter
  198. treewalker = Filter(treewalker)
  199. if self.omit_optional_tags:
  200. from .filters.optionaltags import Filter
  201. treewalker = Filter(treewalker)
  202. for token in treewalker:
  203. type = token["type"]
  204. if type == "Doctype":
  205. doctype = "<!DOCTYPE %s" % token["name"]
  206. if token["publicId"]:
  207. doctype += ' PUBLIC "%s"' % token["publicId"]
  208. elif token["systemId"]:
  209. doctype += " SYSTEM"
  210. if token["systemId"]:
  211. if token["systemId"].find('"') >= 0:
  212. if token["systemId"].find("'") >= 0:
  213. self.serializeError("System identifer contains both single and double quote characters")
  214. quote_char = "'"
  215. else:
  216. quote_char = '"'
  217. doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
  218. doctype += ">"
  219. yield self.encodeStrict(doctype)
  220. elif type in ("Characters", "SpaceCharacters"):
  221. if type == "SpaceCharacters" or in_cdata:
  222. if in_cdata and token["data"].find("</") >= 0:
  223. self.serializeError("Unexpected </ in CDATA")
  224. yield self.encode(token["data"])
  225. else:
  226. yield self.encode(escape(token["data"]))
  227. elif type in ("StartTag", "EmptyTag"):
  228. name = token["name"]
  229. yield self.encodeStrict("<%s" % name)
  230. if name in rcdataElements and not self.escape_rcdata:
  231. in_cdata = True
  232. elif in_cdata:
  233. self.serializeError("Unexpected child element of a CDATA element")
  234. for (_, attr_name), attr_value in token["data"].items():
  235. # TODO: Add namespace support here
  236. k = attr_name
  237. v = attr_value
  238. yield self.encodeStrict(' ')
  239. yield self.encodeStrict(k)
  240. if not self.minimize_boolean_attributes or \
  241. (k not in booleanAttributes.get(name, tuple()) and
  242. k not in booleanAttributes.get("", tuple())):
  243. yield self.encodeStrict("=")
  244. if self.quote_attr_values == "always" or len(v) == 0:
  245. quote_attr = True
  246. elif self.quote_attr_values == "spec":
  247. quote_attr = _quoteAttributeSpec.search(v) is not None
  248. elif self.quote_attr_values == "legacy":
  249. quote_attr = _quoteAttributeLegacy.search(v) is not None
  250. else:
  251. raise ValueError("quote_attr_values must be one of: "
  252. "'always', 'spec', or 'legacy'")
  253. v = v.replace("&", "&amp;")
  254. if self.escape_lt_in_attrs:
  255. v = v.replace("<", "&lt;")
  256. if quote_attr:
  257. quote_char = self.quote_char
  258. if self.use_best_quote_char:
  259. if "'" in v and '"' not in v:
  260. quote_char = '"'
  261. elif '"' in v and "'" not in v:
  262. quote_char = "'"
  263. if quote_char == "'":
  264. v = v.replace("'", "&#39;")
  265. else:
  266. v = v.replace('"', "&quot;")
  267. yield self.encodeStrict(quote_char)
  268. yield self.encode(v)
  269. yield self.encodeStrict(quote_char)
  270. else:
  271. yield self.encode(v)
  272. if name in voidElements and self.use_trailing_solidus:
  273. if self.space_before_trailing_solidus:
  274. yield self.encodeStrict(" /")
  275. else:
  276. yield self.encodeStrict("/")
  277. yield self.encode(">")
  278. elif type == "EndTag":
  279. name = token["name"]
  280. if name in rcdataElements:
  281. in_cdata = False
  282. elif in_cdata:
  283. self.serializeError("Unexpected child element of a CDATA element")
  284. yield self.encodeStrict("</%s>" % name)
  285. elif type == "Comment":
  286. data = token["data"]
  287. if data.find("--") >= 0:
  288. self.serializeError("Comment contains --")
  289. yield self.encodeStrict("<!--%s-->" % token["data"])
  290. elif type == "Entity":
  291. name = token["name"]
  292. key = name + ";"
  293. if key not in entities:
  294. self.serializeError("Entity %s not recognized" % name)
  295. if self.resolve_entities and key not in xmlEntities:
  296. data = entities[key]
  297. else:
  298. data = "&%s;" % name
  299. yield self.encodeStrict(data)
  300. else:
  301. self.serializeError(token["data"])
  302. def render(self, treewalker, encoding=None):
  303. """Serializes the stream from the treewalker into a string
  304. :arg treewalker: the treewalker to serialize
  305. :arg encoding: the string encoding to use
  306. :returns: the serialized tree
  307. Example:
  308. >>> from html5lib import parse, getTreeWalker
  309. >>> from html5lib.serializer import HTMLSerializer
  310. >>> token_stream = parse('<html><body>Hi!</body></html>')
  311. >>> walker = getTreeWalker('etree')
  312. >>> serializer = HTMLSerializer(omit_optional_tags=False)
  313. >>> serializer.render(walker(token_stream))
  314. '<html><head></head><body>Hi!</body></html>'
  315. """
  316. if encoding:
  317. return b"".join(list(self.serialize(treewalker, encoding)))
  318. else:
  319. return "".join(list(self.serialize(treewalker)))
  320. def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
  321. # XXX The idea is to make data mandatory.
  322. self.errors.append(data)
  323. if self.strict:
  324. raise SerializeError
  325. class SerializeError(Exception):
  326. """Error in serialized tree"""
  327. pass