282 lines
9.6 KiB
Python
282 lines
9.6 KiB
Python
# markdown/serializers.py
|
|
#
|
|
# Add x/html serialization to ElementTree
|
|
# Taken from ElementTree 1.3 preview with slight modifications
|
|
#
|
|
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
|
|
#
|
|
# fredrik@pythonware.com
|
|
# http://www.pythonware.com
|
|
#
|
|
# --------------------------------------------------------------------
|
|
# The ElementTree toolkit is
|
|
#
|
|
# Copyright (c) 1999-2007 by Fredrik Lundh
|
|
#
|
|
# By obtaining, using, and/or copying this software and/or its
|
|
# associated documentation, you agree that you have read, understood,
|
|
# and will comply with the following terms and conditions:
|
|
#
|
|
# Permission to use, copy, modify, and distribute this software and
|
|
# its associated documentation for any purpose and without fee is
|
|
# hereby granted, provided that the above copyright notice appears in
|
|
# all copies, and that both that copyright notice and this permission
|
|
# notice appear in supporting documentation, and that the name of
|
|
# Secret Labs AB or the author not be used in advertising or publicity
|
|
# pertaining to distribution of the software without specific, written
|
|
# prior permission.
|
|
#
|
|
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
|
|
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
|
|
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
|
|
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
|
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
|
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
|
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
|
# OF THIS SOFTWARE.
|
|
# --------------------------------------------------------------------
|
|
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import unicode_literals
|
|
from . import util
|
|
# Module-level aliases for the ElementTree names used by the serializer.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
# Use ``util.etree.test_comment`` as the Comment factory when the etree
# module provides it; otherwise use the regular ``Comment``.
Comment = (
    util.etree.test_comment
    if hasattr(util.etree, 'test_comment')
    else util.etree.Comment
)  # pragma: no cover
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

# Public API of this module.
__all__ = ['to_html_string', 'to_xhtml_string']
|
|
|
|
# Lower-case tag names that the serializer writes without a closing tag
# (and self-closes as " />" in XHTML output).
# NOTE: every name needs its own comma; adjacent string literals are
# silently concatenated ("meta" "param" would become "metaparam").
HTML_EMPTY = set((
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
))
|
|
|
|
_namespace_map = {
|
|
# "well-known" namespace prefixes
|
|
"http://www.w3.org/XML/1998/namespace": "xml",
|
|
"http://www.w3.org/1999/xhtml": "html",
|
|
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
|
|
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
|
|
# xml schema
|
|
"http://www.w3.org/2001/XMLSchema": "xs",
|
|
"http://www.w3.org/2001/XMLSchema-instance": "xsi",
|
|
# dublic core
|
|
"http://purl.org/dc/elements/1.1/": "dc",
|
|
}
|
|
|
|
|
|
def _raise_serialization_error(text): # pragma: no cover
|
|
raise TypeError(
|
|
"cannot serialize %r (type %s)" % (text, type(text).__name__)
|
|
)
|
|
|
|
|
|
def _encode(text, encoding):
|
|
try:
|
|
return text.encode(encoding, "xmlcharrefreplace")
|
|
except (TypeError, AttributeError): # pragma: no cover
|
|
_raise_serialization_error(text)
|
|
|
|
|
|
def _escape_cdata(text):
|
|
# escape character data
|
|
try:
|
|
# it's worth avoiding do-nothing calls for strings that are
|
|
# shorter than 500 character, or so. assume that's, by far,
|
|
# the most common case in most applications.
|
|
if "&" in text:
|
|
text = text.replace("&", "&")
|
|
if "<" in text:
|
|
text = text.replace("<", "<")
|
|
if ">" in text:
|
|
text = text.replace(">", ">")
|
|
return text
|
|
except (TypeError, AttributeError): # pragma: no cover
|
|
_raise_serialization_error(text)
|
|
|
|
|
|
def _escape_attrib(text):
|
|
# escape attribute value
|
|
try:
|
|
if "&" in text:
|
|
text = text.replace("&", "&")
|
|
if "<" in text:
|
|
text = text.replace("<", "<")
|
|
if ">" in text:
|
|
text = text.replace(">", ">")
|
|
if "\"" in text:
|
|
text = text.replace("\"", """)
|
|
if "\n" in text:
|
|
text = text.replace("\n", " ")
|
|
return text
|
|
except (TypeError, AttributeError): # pragma: no cover
|
|
_raise_serialization_error(text)
|
|
|
|
|
|
def _escape_attrib_html(text):
|
|
# escape attribute value
|
|
try:
|
|
if "&" in text:
|
|
text = text.replace("&", "&")
|
|
if "<" in text:
|
|
text = text.replace("<", "<")
|
|
if ">" in text:
|
|
text = text.replace(">", ">")
|
|
if "\"" in text:
|
|
text = text.replace("\"", """)
|
|
return text
|
|
except (TypeError, AttributeError): # pragma: no cover
|
|
_raise_serialization_error(text)
|
|
|
|
|
|
def _serialize_html(write, elem, qnames, namespaces, format):
    """Recursively serialize ``elem`` and its subtree as HTML or XHTML.

    Parameters:
        write: callable invoked with each output fragment (a string).
        elem: the element to serialize.
        qnames: mapping from tag/attribute names to their serialized
            names; ``_write_html`` passes the table built by
            ``_namespaces``.
        namespaces: mapping of namespace URI to prefix to declare on this
            element as ``xmlns`` attributes, or a false value. Recursive
            calls pass ``None`` so declarations appear only on the element
            the serialization started from.
        format: ``"html"`` or ``"xhtml"``; selects boolean-attribute
            minimization (html) and `` />`` self-closing (xhtml).
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized tag name: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None, format)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    if qnames[k] == v and format == 'html':
                        # handle boolean attributes
                        write(" %s" % v)
                    else:
                        write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # sorted() rather than .items().sort(): on Python 3
                    # dict.items() is a view without a sort() method.
                    ns_items = sorted(
                        namespaces.items(), key=lambda x: x[1]
                    )  # sort on prefix
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
            if format == "xhtml" and tag.lower() in HTML_EMPTY:
                write(" />")
            else:
                write(">")
                if text:
                    if tag.lower() in ("script", "style"):
                        # script/style content is written without escaping
                        write(text)
                    else:
                        write(_escape_cdata(text))
                for e in elem:
                    _serialize_html(write, e, qnames, None, format)
                if tag.lower() not in HTML_EMPTY:
                    write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
|
|
|
|
|
|
def _write_html(root,
                encoding=None,
                default_namespace=None,
                format="html"):
    """Serialize the tree rooted at ``root`` and return the result.

    Parameters:
        root: root element of the tree; must not be ``None``.
        encoding: when ``None`` (the default) the joined text is returned
            as-is; otherwise the text is passed through ``_encode`` with
            this encoding.
        default_namespace: forwarded to ``_namespaces``.
        format: ``"html"`` or ``"xhtml"``, forwarded to
            ``_serialize_html``.
    """
    assert root is not None
    data = []
    qnames, namespaces = _namespaces(root, default_namespace)
    _serialize_html(data.append, root, qnames, namespaces, format)
    serialized = "".join(data)
    if encoding is None:
        return serialized
    # _encode requires the target encoding as its second argument.
    return _encode(serialized, encoding)
|
|
|
|
|
|
# --------------------------------------------------------------------
|
|
# serialization support
|
|
|
|
def _namespaces(elem, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks ``elem`` and all of its descendants and registers every tag,
    attribute key, ``QName`` attribute value and ``QName`` text.

    Parameters:
        elem: root element of the tree to scan.
        default_namespace: optional namespace URI to map to the empty
            prefix.

    Returns:
        A ``(qnames, namespaces)`` tuple. ``qnames`` maps each name to its
        serialized ``prefix:local`` (or plain) form; ``namespaces`` maps
        namespace URIs to prefixes.

    Raises:
        ValueError: if ``default_namespace`` is given and a non-qualified
            name is encountered.
        TypeError: (via ``_raise_serialization_error``) for a tag that is
            not a ``QName``, a string, ``None``, ``Comment`` or ``PI``.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        # the "xml" prefix is never recorded for declaration
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag  # default element
            else:
                if default_namespace:
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
        except TypeError:  # pragma: no cover
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    # A distinct loop name avoids rebinding the ``elem`` parameter.
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # Keep the "already registered" test nested: a QName seen
            # before must not fall through to the error branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, util.string_type):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
|
|
|
|
|
|
def to_html_string(element):
    """Return ``element`` and its subtree serialized in ``"html"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="html")
|
|
|
|
|
|
def to_xhtml_string(element):
    """Return ``element`` and its subtree serialized in ``"xhtml"`` format."""
    root = ElementTree(element).getroot()
    return _write_html(root, format="xhtml")
|