You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

282 lines
9.6 KiB

4 years ago
  1. # markdown/serializers.py
  2. #
  3. # Add x/html serialization to ElementTree
  4. # Taken from ElementTree 1.3 preview with slight modifications
  5. #
  6. # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
  7. #
  8. # fredrik@pythonware.com
  9. # http://www.pythonware.com
  10. #
  11. # --------------------------------------------------------------------
  12. # The ElementTree toolkit is
  13. #
  14. # Copyright (c) 1999-2007 by Fredrik Lundh
  15. #
  16. # By obtaining, using, and/or copying this software and/or its
  17. # associated documentation, you agree that you have read, understood,
  18. # and will comply with the following terms and conditions:
  19. #
  20. # Permission to use, copy, modify, and distribute this software and
  21. # its associated documentation for any purpose and without fee is
  22. # hereby granted, provided that the above copyright notice appears in
  23. # all copies, and that both that copyright notice and this permission
  24. # notice appear in supporting documentation, and that the name of
  25. # Secret Labs AB or the author not be used in advertising or publicity
  26. # pertaining to distribution of the software without specific, written
  27. # prior permission.
  28. #
  29. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  30. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  31. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  32. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  33. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  34. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  35. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  36. # OF THIS SOFTWARE.
  37. # --------------------------------------------------------------------
  38. from __future__ import absolute_import
  39. from __future__ import unicode_literals
  40. from . import util
  41. ElementTree = util.etree.ElementTree
  42. QName = util.etree.QName
  43. if hasattr(util.etree, 'test_comment'): # pragma: no cover
  44. Comment = util.etree.test_comment
  45. else: # pragma: no cover
  46. Comment = util.etree.Comment
  47. PI = util.etree.PI
  48. ProcessingInstruction = util.etree.ProcessingInstruction
  49. __all__ = ['to_html_string', 'to_xhtml_string']
  50. HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
  51. "img", "input", "isindex", "link", "meta" "param")
  52. try:
  53. HTML_EMPTY = set(HTML_EMPTY)
  54. except NameError: # pragma: no cover
  55. pass
  56. _namespace_map = {
  57. # "well-known" namespace prefixes
  58. "http://www.w3.org/XML/1998/namespace": "xml",
  59. "http://www.w3.org/1999/xhtml": "html",
  60. "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
  61. "http://schemas.xmlsoap.org/wsdl/": "wsdl",
  62. # xml schema
  63. "http://www.w3.org/2001/XMLSchema": "xs",
  64. "http://www.w3.org/2001/XMLSchema-instance": "xsi",
  65. # dublic core
  66. "http://purl.org/dc/elements/1.1/": "dc",
  67. }
  68. def _raise_serialization_error(text): # pragma: no cover
  69. raise TypeError(
  70. "cannot serialize %r (type %s)" % (text, type(text).__name__)
  71. )
  72. def _encode(text, encoding):
  73. try:
  74. return text.encode(encoding, "xmlcharrefreplace")
  75. except (TypeError, AttributeError): # pragma: no cover
  76. _raise_serialization_error(text)
  77. def _escape_cdata(text):
  78. # escape character data
  79. try:
  80. # it's worth avoiding do-nothing calls for strings that are
  81. # shorter than 500 character, or so. assume that's, by far,
  82. # the most common case in most applications.
  83. if "&" in text:
  84. text = text.replace("&", "&")
  85. if "<" in text:
  86. text = text.replace("<", "&lt;")
  87. if ">" in text:
  88. text = text.replace(">", "&gt;")
  89. return text
  90. except (TypeError, AttributeError): # pragma: no cover
  91. _raise_serialization_error(text)
  92. def _escape_attrib(text):
  93. # escape attribute value
  94. try:
  95. if "&" in text:
  96. text = text.replace("&", "&amp;")
  97. if "<" in text:
  98. text = text.replace("<", "&lt;")
  99. if ">" in text:
  100. text = text.replace(">", "&gt;")
  101. if "\"" in text:
  102. text = text.replace("\"", "&quot;")
  103. if "\n" in text:
  104. text = text.replace("\n", "&#10;")
  105. return text
  106. except (TypeError, AttributeError): # pragma: no cover
  107. _raise_serialization_error(text)
  108. def _escape_attrib_html(text):
  109. # escape attribute value
  110. try:
  111. if "&" in text:
  112. text = text.replace("&", "&amp;")
  113. if "<" in text:
  114. text = text.replace("<", "&lt;")
  115. if ">" in text:
  116. text = text.replace(">", "&gt;")
  117. if "\"" in text:
  118. text = text.replace("\"", "&quot;")
  119. return text
  120. except (TypeError, AttributeError): # pragma: no cover
  121. _raise_serialization_error(text)
  122. def _serialize_html(write, elem, qnames, namespaces, format):
  123. tag = elem.tag
  124. text = elem.text
  125. if tag is Comment:
  126. write("<!--%s-->" % _escape_cdata(text))
  127. elif tag is ProcessingInstruction:
  128. write("<?%s?>" % _escape_cdata(text))
  129. else:
  130. tag = qnames[tag]
  131. if tag is None:
  132. if text:
  133. write(_escape_cdata(text))
  134. for e in elem:
  135. _serialize_html(write, e, qnames, None, format)
  136. else:
  137. write("<" + tag)
  138. items = elem.items()
  139. if items or namespaces:
  140. items = sorted(items) # lexical order
  141. for k, v in items:
  142. if isinstance(k, QName):
  143. k = k.text
  144. if isinstance(v, QName):
  145. v = qnames[v.text]
  146. else:
  147. v = _escape_attrib_html(v)
  148. if qnames[k] == v and format == 'html':
  149. # handle boolean attributes
  150. write(" %s" % v)
  151. else:
  152. write(" %s=\"%s\"" % (qnames[k], v))
  153. if namespaces:
  154. items = namespaces.items()
  155. items.sort(key=lambda x: x[1]) # sort on prefix
  156. for v, k in items:
  157. if k:
  158. k = ":" + k
  159. write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
  160. if format == "xhtml" and tag.lower() in HTML_EMPTY:
  161. write(" />")
  162. else:
  163. write(">")
  164. if text:
  165. if tag.lower() in ["script", "style"]:
  166. write(text)
  167. else:
  168. write(_escape_cdata(text))
  169. for e in elem:
  170. _serialize_html(write, e, qnames, None, format)
  171. if tag.lower() not in HTML_EMPTY:
  172. write("</" + tag + ">")
  173. if elem.tail:
  174. write(_escape_cdata(elem.tail))
  175. def _write_html(root,
  176. encoding=None,
  177. default_namespace=None,
  178. format="html"):
  179. assert root is not None
  180. data = []
  181. write = data.append
  182. qnames, namespaces = _namespaces(root, default_namespace)
  183. _serialize_html(write, root, qnames, namespaces, format)
  184. if encoding is None:
  185. return "".join(data)
  186. else:
  187. return _encode("".join(data))
  188. # --------------------------------------------------------------------
  189. # serialization support
  190. def _namespaces(elem, default_namespace=None):
  191. # identify namespaces used in this tree
  192. # maps qnames to *encoded* prefix:local names
  193. qnames = {None: None}
  194. # maps uri:s to prefixes
  195. namespaces = {}
  196. if default_namespace:
  197. namespaces[default_namespace] = ""
  198. def add_qname(qname):
  199. # calculate serialized qname representation
  200. try:
  201. if qname[:1] == "{":
  202. uri, tag = qname[1:].split("}", 1)
  203. prefix = namespaces.get(uri)
  204. if prefix is None:
  205. prefix = _namespace_map.get(uri)
  206. if prefix is None:
  207. prefix = "ns%d" % len(namespaces)
  208. if prefix != "xml":
  209. namespaces[uri] = prefix
  210. if prefix:
  211. qnames[qname] = "%s:%s" % (prefix, tag)
  212. else:
  213. qnames[qname] = tag # default element
  214. else:
  215. if default_namespace:
  216. raise ValueError(
  217. "cannot use non-qualified names with "
  218. "default_namespace option"
  219. )
  220. qnames[qname] = qname
  221. except TypeError: # pragma: no cover
  222. _raise_serialization_error(qname)
  223. # populate qname and namespaces table
  224. try:
  225. iterate = elem.iter
  226. except AttributeError:
  227. iterate = elem.getiterator # cET compatibility
  228. for elem in iterate():
  229. tag = elem.tag
  230. if isinstance(tag, QName) and tag.text not in qnames:
  231. add_qname(tag.text)
  232. elif isinstance(tag, util.string_type):
  233. if tag not in qnames:
  234. add_qname(tag)
  235. elif tag is not None and tag is not Comment and tag is not PI:
  236. _raise_serialization_error(tag)
  237. for key, value in elem.items():
  238. if isinstance(key, QName):
  239. key = key.text
  240. if key not in qnames:
  241. add_qname(key)
  242. if isinstance(value, QName) and value.text not in qnames:
  243. add_qname(value.text)
  244. text = elem.text
  245. if isinstance(text, QName) and text.text not in qnames:
  246. add_qname(text.text)
  247. return qnames, namespaces
  248. def to_html_string(element):
  249. return _write_html(ElementTree(element).getroot(), format="html")
  250. def to_xhtml_string(element):
  251. return _write_html(ElementTree(element).getroot(), format="xhtml")