alpcentaur
/
brieftaube

"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens."""

from __future__ import absolute_import, division, unicode_literals

from .. import constants
from .._utils import default_etree
# Names exported by ``from <package> import *``.
__all__ = ["getTreeWalker", "pprint"]
# Maps a lower-cased tree type name to its TreeWalker class. Filled lazily by
# getTreeWalker for "dom", "genshi" and "lxml"; "etree" walkers are never
# stored here.
treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class for one of the built-in tree types

    :arg str treeType: the name of the tree type required (case-insensitive).
        Supported values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: A module implementing the tree type e.g.
        xml.etree.ElementTree or cElementTree (Currently applies to the
        "etree" tree type only).

    :arg kwargs: keyword arguments passed to the etree walker--for other
        walkers, this has no effect

    :returns: a TreeWalker class, or ``None`` when ``treeType`` is not one
        of the supported names

    """
    treeType = treeType.lower()

    # Fast path: this tree type has already been resolved once.
    if treeType in treeWalkerCache:
        return treeWalkerCache[treeType]

    if treeType == "etree":
        from . import etree
        etreeImplementation = default_etree if implementation is None else implementation
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(etreeImplementation, **kwargs).TreeWalker

    # The walker modules are imported on demand so that optional
    # dependencies are only needed when their walker is requested.
    if treeType == "dom":
        from . import dom as walkerModule
    elif treeType == "genshi":
        from . import genshi as walkerModule
    elif treeType == "lxml":
        from . import etree_lxml as walkerModule
    else:
        return None

    walkerClass = walkerModule.TreeWalker
    treeWalkerCache[treeType] = walkerClass
    return walkerClass

def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent text tokens into single "Characters" tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are collapsed into
    one newly created "Characters" token whose data is the concatenation of
    the run. Every other token is passed through unchanged, in order.

    :arg tokens: an iterable of token dicts, each with a "type" key (and a
        "data" key for the text token types)

    :returns: a generator yielding the merged token stream

    """
    textTypes = ("Characters", "SpaceCharacters")
    buffered = []

    for tok in tokens:
        if tok["type"] in textTypes:
            buffered.append(tok["data"])
            continue

        # A non-text token ends the current run: emit the run first.
        if buffered:
            yield {"type": "Characters", "data": "".join(buffered)}
            buffered = []
        yield tok

    # Emit any text left over at the end of the stream.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}

def pprint(walker):
    """Pretty printer for tree walkers

    Walks the token stream (after merging adjacent character tokens) and
    renders it as an outline: one line per start tag, attribute, comment,
    doctype and text run, indented two spaces per open element.

    :arg walker: a TreeWalker instance

    :returns: the rendered lines joined with newlines

    :raises ValueError: if a token has an unrecognised type

    """
    def qualified(namespace, localName):
        # Known namespaces are shown by their short prefix, others verbatim.
        return "%s %s" % (constants.prefixes.get(namespace, namespace), localName)

    lines = []
    depth = 0

    for token in concatenateCharacterTokens(walker):
        kind = token["type"]
        pad = " " * depth

        if kind == "StartTag" or kind == "EmptyTag":
            # tag name (the HTML namespace is implicit and not printed)
            tagNamespace = token["namespace"]
            if tagNamespace and tagNamespace != constants.namespaces["html"]:
                label = qualified(tagNamespace, token["name"])
            else:
                label = token["name"]
            lines.append("%s<%s>" % (pad, label))

            # attributes sit one level below their element
            # (sorted for consistent ordering)
            attrPad = " " * (depth + 2)
            for (attrNamespace, attrName), attrValue in sorted(token["data"].items()):
                if attrNamespace:
                    label = qualified(attrNamespace, attrName)
                else:
                    label = attrName
                lines.append("%s%s=\"%s\"" % (attrPad, label, attrValue))

            # only a real start tag opens a new nesting level
            if kind == "StartTag":
                depth += 2

        elif kind == "EndTag":
            depth -= 2

        elif kind == "Comment":
            lines.append("%s<!-- %s -->" % (pad, token["data"]))

        elif kind == "Doctype":
            if not token["name"]:
                lines.append("%s<!DOCTYPE >" % (pad,))
            elif token["publicId"]:
                lines.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                             (pad,
                              token["name"],
                              token["publicId"],
                              token["systemId"] or ""))
            elif token["systemId"]:
                lines.append("""%s<!DOCTYPE %s "" "%s">""" %
                             (pad,
                              token["name"],
                              token["systemId"]))
            else:
                lines.append("%s<!DOCTYPE %s>" % (pad, token["name"]))

        elif kind == "Characters":
            lines.append("%s\"%s\"" % (pad, token["data"]))

        elif kind == "SpaceCharacters":
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % kind)

    return "\n".join(lines)