|
|
- # markdown/searializers.py
- #
- # Add x/html serialization to Elementree
- # Taken from ElementTree 1.3 preview with slight modifications
- #
- # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
- #
- # fredrik@pythonware.com
- # http://www.pythonware.com
- #
- # --------------------------------------------------------------------
- # The ElementTree toolkit is
- #
- # Copyright (c) 1999-2007 by Fredrik Lundh
- #
- # By obtaining, using, and/or copying this software and/or its
- # associated documentation, you agree that you have read, understood,
- # and will comply with the following terms and conditions:
- #
- # Permission to use, copy, modify, and distribute this software and
- # its associated documentation for any purpose and without fee is
- # hereby granted, provided that the above copyright notice appears in
- # all copies, and that both that copyright notice and this permission
- # notice appear in supporting documentation, and that the name of
- # Secret Labs AB or the author not be used in advertising or publicity
- # pertaining to distribution of the software without specific, written
- # prior permission.
- #
- # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
- # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
- # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
- # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
- # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
- # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
- # OF THIS SOFTWARE.
- # --------------------------------------------------------------------
-
-
- from __future__ import absolute_import
- from __future__ import unicode_literals
- from . import util
- ElementTree = util.etree.ElementTree
- QName = util.etree.QName
- if hasattr(util.etree, 'test_comment'): # pragma: no cover
- Comment = util.etree.test_comment
- else: # pragma: no cover
- Comment = util.etree.Comment
- PI = util.etree.PI
- ProcessingInstruction = util.etree.ProcessingInstruction
-
- __all__ = ['to_html_string', 'to_xhtml_string']
-
- HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
- "img", "input", "isindex", "link", "meta" "param")
-
- try:
- HTML_EMPTY = set(HTML_EMPTY)
- except NameError: # pragma: no cover
- pass
-
- _namespace_map = {
- # "well-known" namespace prefixes
- "http://www.w3.org/XML/1998/namespace": "xml",
- "http://www.w3.org/1999/xhtml": "html",
- "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
- "http://schemas.xmlsoap.org/wsdl/": "wsdl",
- # xml schema
- "http://www.w3.org/2001/XMLSchema": "xs",
- "http://www.w3.org/2001/XMLSchema-instance": "xsi",
- # dublic core
- "http://purl.org/dc/elements/1.1/": "dc",
- }
-
-
- def _raise_serialization_error(text): # pragma: no cover
- raise TypeError(
- "cannot serialize %r (type %s)" % (text, type(text).__name__)
- )
-
-
- def _encode(text, encoding):
- try:
- return text.encode(encoding, "xmlcharrefreplace")
- except (TypeError, AttributeError): # pragma: no cover
- _raise_serialization_error(text)
-
-
- def _escape_cdata(text):
- # escape character data
- try:
- # it's worth avoiding do-nothing calls for strings that are
- # shorter than 500 character, or so. assume that's, by far,
- # the most common case in most applications.
- if "&" in text:
- text = text.replace("&", "&")
- if "<" in text:
- text = text.replace("<", "<")
- if ">" in text:
- text = text.replace(">", ">")
- return text
- except (TypeError, AttributeError): # pragma: no cover
- _raise_serialization_error(text)
-
-
- def _escape_attrib(text):
- # escape attribute value
- try:
- if "&" in text:
- text = text.replace("&", "&")
- if "<" in text:
- text = text.replace("<", "<")
- if ">" in text:
- text = text.replace(">", ">")
- if "\"" in text:
- text = text.replace("\"", """)
- if "\n" in text:
- text = text.replace("\n", " ")
- return text
- except (TypeError, AttributeError): # pragma: no cover
- _raise_serialization_error(text)
-
-
- def _escape_attrib_html(text):
- # escape attribute value
- try:
- if "&" in text:
- text = text.replace("&", "&")
- if "<" in text:
- text = text.replace("<", "<")
- if ">" in text:
- text = text.replace(">", ">")
- if "\"" in text:
- text = text.replace("\"", """)
- return text
- except (TypeError, AttributeError): # pragma: no cover
- _raise_serialization_error(text)
-
-
- def _serialize_html(write, elem, qnames, namespaces, format):
- tag = elem.tag
- text = elem.text
- if tag is Comment:
- write("<!--%s-->" % _escape_cdata(text))
- elif tag is ProcessingInstruction:
- write("<?%s?>" % _escape_cdata(text))
- else:
- tag = qnames[tag]
- if tag is None:
- if text:
- write(_escape_cdata(text))
- for e in elem:
- _serialize_html(write, e, qnames, None, format)
- else:
- write("<" + tag)
- items = elem.items()
- if items or namespaces:
- items = sorted(items) # lexical order
- for k, v in items:
- if isinstance(k, QName):
- k = k.text
- if isinstance(v, QName):
- v = qnames[v.text]
- else:
- v = _escape_attrib_html(v)
- if qnames[k] == v and format == 'html':
- # handle boolean attributes
- write(" %s" % v)
- else:
- write(" %s=\"%s\"" % (qnames[k], v))
- if namespaces:
- items = namespaces.items()
- items.sort(key=lambda x: x[1]) # sort on prefix
- for v, k in items:
- if k:
- k = ":" + k
- write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
- if format == "xhtml" and tag.lower() in HTML_EMPTY:
- write(" />")
- else:
- write(">")
- if text:
- if tag.lower() in ["script", "style"]:
- write(text)
- else:
- write(_escape_cdata(text))
- for e in elem:
- _serialize_html(write, e, qnames, None, format)
- if tag.lower() not in HTML_EMPTY:
- write("</" + tag + ">")
- if elem.tail:
- write(_escape_cdata(elem.tail))
-
-
- def _write_html(root,
- encoding=None,
- default_namespace=None,
- format="html"):
- assert root is not None
- data = []
- write = data.append
- qnames, namespaces = _namespaces(root, default_namespace)
- _serialize_html(write, root, qnames, namespaces, format)
- if encoding is None:
- return "".join(data)
- else:
- return _encode("".join(data))
-
-
- # --------------------------------------------------------------------
- # serialization support
-
- def _namespaces(elem, default_namespace=None):
- # identify namespaces used in this tree
-
- # maps qnames to *encoded* prefix:local names
- qnames = {None: None}
-
- # maps uri:s to prefixes
- namespaces = {}
- if default_namespace:
- namespaces[default_namespace] = ""
-
- def add_qname(qname):
- # calculate serialized qname representation
- try:
- if qname[:1] == "{":
- uri, tag = qname[1:].split("}", 1)
- prefix = namespaces.get(uri)
- if prefix is None:
- prefix = _namespace_map.get(uri)
- if prefix is None:
- prefix = "ns%d" % len(namespaces)
- if prefix != "xml":
- namespaces[uri] = prefix
- if prefix:
- qnames[qname] = "%s:%s" % (prefix, tag)
- else:
- qnames[qname] = tag # default element
- else:
- if default_namespace:
- raise ValueError(
- "cannot use non-qualified names with "
- "default_namespace option"
- )
- qnames[qname] = qname
- except TypeError: # pragma: no cover
- _raise_serialization_error(qname)
-
- # populate qname and namespaces table
- try:
- iterate = elem.iter
- except AttributeError:
- iterate = elem.getiterator # cET compatibility
- for elem in iterate():
- tag = elem.tag
- if isinstance(tag, QName) and tag.text not in qnames:
- add_qname(tag.text)
- elif isinstance(tag, util.string_type):
- if tag not in qnames:
- add_qname(tag)
- elif tag is not None and tag is not Comment and tag is not PI:
- _raise_serialization_error(tag)
- for key, value in elem.items():
- if isinstance(key, QName):
- key = key.text
- if key not in qnames:
- add_qname(key)
- if isinstance(value, QName) and value.text not in qnames:
- add_qname(value.text)
- text = elem.text
- if isinstance(text, QName) and text.text not in qnames:
- add_qname(text.text)
- return qnames, namespaces
-
-
- def to_html_string(element):
- return _write_html(ElementTree(element).getroot(), format="html")
-
-
- def to_xhtml_string(element):
- return _write_html(ElementTree(element).getroot(), format="xhtml")
|