alpcentaur
/
brieftaube

"""External interface to the BeautifulSoup HTML parser.
"""

__all__ = ["fromstring", "parse", "convert_tree"]
import refrom lxml import etree, html
try:    from bs4 import (        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,        Declaration, Doctype)    _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)except ImportError:    from BeautifulSoup import (        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,        Declaration)    _DECLARATION_OR_DOCTYPE = Declaration

def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    A different BeautifulSoup parser can be supplied through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  When they are omitted, the
    standard ``BeautifulSoup`` class and the default factory of
    `lxml.html` are used.  Any further keyword arguments are handed to
    the BeautifulSoup parser unchanged.
    """
    root = _parse(data, beautifulsoup, makeelement, **bsargs)
    return root

def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    `file` is either an object with a ``read`` attribute (used as is and
    left open for the caller to manage) or a path, which is opened here
    and closed again before returning.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.  Any further keyword arguments are passed on to the
    BeautifulSoup parser.

    Returns an ``etree.ElementTree`` wrapping the root element.
    """
    if hasattr(file, 'read'):
        # Caller-owned stream: never close it on the caller's behalf.
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    else:
        # We opened the file, so we are responsible for closing it, also
        # when parsing raises.
        # NOTE(review): this relies on the parser consuming the stream
        # while the soup is being built, as the stock BeautifulSoup
        # classes do; a custom `beautifulsoup` factory that reads lazily
        # would need the stream kept open - confirm none exists.
        with open(file) as stream:
            root = _parse(stream, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)

def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Snapshot the children first: ``getchildren()`` is deprecated in
    # lxml in favour of ``list(element)``, and we must not iterate the
    # element itself while removing from it.
    children = list(root)
    # Detach every child from the temporary root so each returned
    # element is the root of its own tree.
    for child in children:
        root.remove(child)
    return children

# helpers
def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Run BeautifulSoup over `source` and return the ``<html>`` root Element.

    `beautifulsoup` is the soup class/factory to call (``None`` selects
    the imported ``BeautifulSoup``); `makeelement` is forwarded to
    ``_convert_tree``.  Version-specific parser defaults are filled into
    `bsargs` only when the caller did not provide them.
    """
    soup_factory = BeautifulSoup if beautifulsoup is None else beautifulsoup

    if hasattr(soup_factory, "HTML_ENTITIES"):  # bs3
        bsargs.setdefault('convertEntities', 'html')
    if hasattr(soup_factory, "DEFAULT_BUILDER_FEATURES"):  # bs4
        # use Python html parser
        bsargs.setdefault('features', 'html.parser')

    soup = soup_factory(source, **bsargs)
    root = _convert_tree(soup, makeelement)

    # from ET: wrap the document in a html root element, if necessary
    if len(root) != 1 or root[0].tag != "html":
        root.tag = "html"
        return root
    # The converted root merely wraps a single real <html> element.
    return root[0]

# Matcher for an HTML DOCTYPE declaration, anchored at the start of the
# string and case-insensitive.  Leading whitespace, '<' and '!' are
# skipped.  Group 1 and group 2 capture the first and second quoted
# identifier *including* their surrounding quote characters; a missing
# identifier yields None.
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    flags=re.IGNORECASE,
).match

class _PseudoTag:
    """Minimal imitation of BeautifulSoup.Tag.

    Carries a ``name`` of 'html', an empty ``attrs`` list and the given
    ``contents``; iterating the object iterates over ``contents``.
    """

    def __init__(self, contents):
        self.contents = contents
        self.attrs = []
        self.name = 'html'

    def __iter__(self):
        return iter(self.contents)

def _convert_tree(beautiful_soup_tree, makeelement):
    """Convert a BeautifulSoup tree into elements and return the root.

    The top-level nodes of `beautiful_soup_tree` are gathered under a
    single ``html`` root: an existing ``html`` Tag is reused (its
    ``contents`` is reassigned in place to absorb the other root-level
    nodes), otherwise a ``_PseudoTag`` is created.  Nodes before the
    first and after the last top-level Tag are attached as preceding and
    following siblings of the converted root.  If a declaration/doctype
    node is present, its identifiers are copied into the tree's docinfo.

    `makeelement` is the element factory; ``None`` selects
    ``html.html_parser.makeelement``.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            # Only the first top-level <html> tag is remembered.
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            # Only the first declaration/doctype node is remembered.
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        # No Tag at top level: everything becomes content of the root.
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        # (the other roots are spliced around its own contents, keeping
        # their relative order; this mutates the soup's html tag)
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    res_root = convert_node(html_root)
    # Walk pre_root backwards: each converted node is inserted directly
    # before the previously inserted one, which keeps document order.
    # Nodes for which convert_node() returns None are skipped.
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    # (forwards this time, each node inserted after the previous one)
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            # No output_ready() on this node: fall back to its string.
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            # (`x and x[1:-1]` leaves a falsy value untouched and otherwise
            # drops the first and last character, i.e. the quotes)
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root

def _init_node_converters(makeelement):
    """Build and return ``convert_node(bs_node, parent=None)``.

    The returned function dispatches a BeautifulSoup node to a handler
    chosen by the node's type.  Handlers are registered below with the
    ``@converter(...)`` decorator; for a type that was not registered
    exactly, the first registered type the node is an instance of is
    used and the outcome (possibly ``None``) is cached.  `makeelement`
    is the factory used to create elements that have no parent.
    """
    handlers = {}
    registration_order = []

    def converter(*node_types):
        # Decorator factory: register the decorated function for every
        # given type and remember the order of registration.
        def register(func):
            for node_type in node_types:
                handlers[node_type] = func
                registration_order.append(node_type)
            return func
        return register

    def find_best_converter(node):
        # The first registered type that `node` is an instance of wins.
        for node_type in registration_order:
            if isinstance(node, node_type):
                return handlers[node_type]
        return None

    def convert_node(bs_node, parent=None):
        # the lookup part is duplicated in convert_tag() below
        node_type = type(bs_node)
        if node_type in handlers:
            handler = handlers[node_type]
        else:
            # Unseen exact type: resolve via isinstance() and cache it.
            handler = handlers[node_type] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if not isinstance(bs_attrs, dict):
            # not a dict: a sequence of (name, value) pairs
            return dict((name, unescape(value)) for name, value in bs_attrs)
        # bs4
        mapped = {}
        for name, value in bs_attrs.items():
            if isinstance(value, list):
                # list-valued attribute: flatten to a single string
                value = " ".join(value)
            mapped[name] = unescape(value)
        return mapped

    def append_text(parent, text):
        if len(parent):
            # Text after the last child belongs to that child's tail.
            last_child = parent[-1]
            last_child.tail = (last_child.tail or '') + text
        else:
            parent.text = (parent.text or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is None:
            attribs = map_attrs(attrs) if attrs else {}
            element = makeelement(bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else None
            element = etree.SubElement(parent, bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            child_type = type(child)
            if child_type not in handlers:
                # Let convert_node() resolve and cache the handler.
                convert_node(child, element)
                continue
            child_handler = handlers[child_type]
            if child_handler is not None:
                child_handler(child, element)
        return element

    @converter(Comment)
    def convert_comment(bs_node, parent):
        comment = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(comment)
        return comment

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        pi_text = bs_node
        if pi_text.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            pi_text = pi_text[:-1]
        # Split into target and (optional) data at the first space.
        target_and_data = pi_text.split(' ', 1)
        pi = etree.ProcessingInstruction(*target_and_data)
        if parent is not None:
            parent.append(pi)
        return pi

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        # Text never becomes a node of its own; without a parent it is
        # dropped.
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node

# copied from ET's ElementSoup
try:    from html.entities import name2codepoint  # Python 3except ImportError:    from htmlentitydefs import name2codepoint

# ``handle_entities(repl, text)`` substitutes every "&name;" style
# reference in ``text`` (``name`` being one or more word characters,
# available as group 1 of the match) using ``repl``.
_named_entity_reference = re.compile(r"&(\w+);")
handle_entities = _named_entity_reference.sub

# Compatibility shim: probe for the ``unichr`` name and, where it does
# not exist, bind it to ``chr`` so the rest of the module can call
# ``unichr`` unconditionally.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr

def unescape(string):
    """Replace known named entity references in `string` by their character.

    A falsy `string` (``None``, ``''``) yields ``''``.  References whose
    name is not in ``name2codepoint`` are left exactly as written.
    """
    if not string:
        return ''

    # work around oddities in BeautifulSoup's entity handling
    def replace_named(match):
        codepoint = name2codepoint.get(match.group(1))
        if codepoint is None:
            return match.group(0)  # use as is
        return unichr(codepoint)

    return handle_entities(replace_named, string)