|
|
- """A collection of modules for iterating through different kinds of
- tree, generating tokens identical to those produced by the tokenizer
- module.
-
- To create a tree walker for a new type of tree, you need to do
- implement a tree walker object (called TreeWalker by convention) that
- implements a 'serialize' method taking a tree as sole argument and
- returning an iterator generating tokens.
- """
-
- from __future__ import absolute_import, division, unicode_literals
-
- from .. import constants
- from .._utils import default_etree
-
- __all__ = ["getTreeWalker", "pprint"]
-
- treeWalkerCache = {}
-
-
- def getTreeWalker(treeType, implementation=None, **kwargs):
- """Get a TreeWalker class for various types of tree with built-in support
-
- :arg str treeType: the name of the tree type required (case-insensitive).
- Supported values are:
-
- * "dom": The xml.dom.minidom DOM implementation
- * "etree": A generic walker for tree implementations exposing an
- elementtree-like interface (known to work with ElementTree,
- cElementTree and lxml.etree).
- * "lxml": Optimized walker for lxml.etree
- * "genshi": a Genshi stream
-
- :arg implementation: A module implementing the tree type e.g.
- xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
- tree type only).
-
- :arg kwargs: keyword arguments passed to the etree walker--for other
- walkers, this has no effect
-
- :returns: a TreeWalker class
-
- """
-
- treeType = treeType.lower()
- if treeType not in treeWalkerCache:
- if treeType == "dom":
- from . import dom
- treeWalkerCache[treeType] = dom.TreeWalker
- elif treeType == "genshi":
- from . import genshi
- treeWalkerCache[treeType] = genshi.TreeWalker
- elif treeType == "lxml":
- from . import etree_lxml
- treeWalkerCache[treeType] = etree_lxml.TreeWalker
- elif treeType == "etree":
- from . import etree
- if implementation is None:
- implementation = default_etree
- # XXX: NEVER cache here, caching is done in the etree submodule
- return etree.getETreeModule(implementation, **kwargs).TreeWalker
- return treeWalkerCache.get(treeType)
-
-
- def concatenateCharacterTokens(tokens):
- pendingCharacters = []
- for token in tokens:
- type = token["type"]
- if type in ("Characters", "SpaceCharacters"):
- pendingCharacters.append(token["data"])
- else:
- if pendingCharacters:
- yield {"type": "Characters", "data": "".join(pendingCharacters)}
- pendingCharacters = []
- yield token
- if pendingCharacters:
- yield {"type": "Characters", "data": "".join(pendingCharacters)}
-
-
- def pprint(walker):
- """Pretty printer for tree walkers
-
- Takes a TreeWalker instance and pretty prints the output of walking the tree.
-
- :arg walker: a TreeWalker instance
-
- """
- output = []
- indent = 0
- for token in concatenateCharacterTokens(walker):
- type = token["type"]
- if type in ("StartTag", "EmptyTag"):
- # tag name
- if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
- if token["namespace"] in constants.prefixes:
- ns = constants.prefixes[token["namespace"]]
- else:
- ns = token["namespace"]
- name = "%s %s" % (ns, token["name"])
- else:
- name = token["name"]
- output.append("%s<%s>" % (" " * indent, name))
- indent += 2
- # attributes (sorted for consistent ordering)
- attrs = token["data"]
- for (namespace, localname), value in sorted(attrs.items()):
- if namespace:
- if namespace in constants.prefixes:
- ns = constants.prefixes[namespace]
- else:
- ns = namespace
- name = "%s %s" % (ns, localname)
- else:
- name = localname
- output.append("%s%s=\"%s\"" % (" " * indent, name, value))
- # self-closing
- if type == "EmptyTag":
- indent -= 2
-
- elif type == "EndTag":
- indent -= 2
-
- elif type == "Comment":
- output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
-
- elif type == "Doctype":
- if token["name"]:
- if token["publicId"]:
- output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
- (" " * indent,
- token["name"],
- token["publicId"],
- token["systemId"] if token["systemId"] else ""))
- elif token["systemId"]:
- output.append("""%s<!DOCTYPE %s "" "%s">""" %
- (" " * indent,
- token["name"],
- token["systemId"]))
- else:
- output.append("%s<!DOCTYPE %s>" % (" " * indent,
- token["name"]))
- else:
- output.append("%s<!DOCTYPE >" % (" " * indent,))
-
- elif type == "Characters":
- output.append("%s\"%s\"" % (" " * indent, token["data"]))
-
- elif type == "SpaceCharacters":
- assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
-
- else:
- raise ValueError("Unknown token type, %s" % type)
-
- return "\n".join(output)
|