# alpcentaur / basabuuka_prototyp

"""Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT"
import cProfile
import os
import pstats
import random
import sys
import tempfile
import time
import traceback
from html.parser import HTMLParser
from io import BytesIO, StringIO

import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    well-known parsers have a registered tree builder, and then shows
    what each available parser makes of the markup.

    :param data: The markup to diagnose. May be a string of markup, an
        object with a ``read`` method, or the name of an existing file
        whose contents should be read. A string starting with ``http:``
        or ``https:`` is reported as a URL and not processed further.
    :return: None. Everything is printed to standard output.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Build the list of usable parsers instead of removing entries from a
    # list while iterating over it: removing during iteration skips the
    # entry that follows each removed one, so it was never checked.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif data.startswith(("http:", "https:")):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    else:
        try:
            if os.path.exists(data):
                print('"%s" looks like a filename. Reading data from the file.' % data)
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        print()

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, features=parser)
        except Exception:
            # A diagnostic tool should keep going and show the failure
            # rather than abort on the first parser that chokes.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: The markup to parse, as a string or a bytestring.
    :param html: Passed to ``etree.iterparse`` as its ``html`` argument.
    :param kwargs: Any additional keyword arguments are passed through
        to ``etree.iterparse`` unchanged.
    :return: None. One line per parse event is printed to standard output.
    """
    from lxml import etree

    # Hand lxml a byte stream. StringIO cannot wrap bytes input at all,
    # and encoding text up front gives one code path for both kinds of
    # markup.
    if isinstance(data, str):
        data = data.encode("utf8")
    for event, element in etree.iterparse(BytesIO(data), html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
    """Announces HTMLParser parse events, without doing anything else."""

    def _p(self, s):
        """Write one announcement line to standard output."""
        print(s)

    def handle_starttag(self, name, attrs):
        """Announce an opening tag; its attributes are not reported."""
        message = "%s START" % name
        self._p(message)

    def handle_endtag(self, name):
        """Announce a closing tag."""
        message = "%s END" % name
        self._p(message)

    def handle_data(self, data):
        """Announce a run of text."""
        message = "%s DATA" % data
        self._p(message)

    def handle_charref(self, name):
        """Announce a numeric character reference."""
        message = "%s CHARREF" % name
        self._p(message)

    def handle_entityref(self, name):
        """Announce a named entity reference."""
        message = "%s ENTITYREF" % name
        self._p(message)

    def handle_comment(self, data):
        """Announce a comment."""
        message = "%s COMMENT" % data
        self._p(message)

    def handle_decl(self, data):
        """Announce a declaration such as a doctype."""
        message = "%s DECL" % data
        self._p(message)

    def unknown_decl(self, data):
        """Announce a declaration HTMLParser did not recognize."""
        message = "%s UNKNOWN-DECL" % data
        self._p(message)

    def handle_pi(self, data):
        """Announce a processing instruction."""
        message = "%s PI" % data
        self._p(message)
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: The markup to parse, as a string.
    :return: None. One line per parse event is printed to standard output.
    """
    parser = AnnouncingParser()
    parser.feed(data)
    # feed() may hold back trailing text while it waits for more input
    # (for example after an unterminated "&"). close() forces that
    # buffered text through, so the trace covers the whole document.
    parser.close()
# Letter pools used by rword(): consonants fill the even positions and
# vowels the odd ones.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"


def rword(length=5):
    """Generate a random word-like string.

    Characters alternate between a random consonant and a random vowel,
    starting with a consonant.

    :param length: Number of characters to generate.
    :return: The generated string.
    """
    return "".join(
        random.choice(_vowels if position % 2 else _consonants)
        for position in range(length)
    )
def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: Number of words in the sentence.
    :return: The words, each 4 to 9 characters long, joined by spaces.
    """
    words = []
    for _ in range(length):
        word_length = random.randint(4, 9)
        words.append(rword(word_length))
    return " ".join(words)


def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Number of random draws to make. Each draw adds
        an opening tag, a closing tag, a run of text, or nothing.
    :return: The generated markup wrapped in ``<html>...</html>``.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 1:
            # A run of text.
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll in (0, 2):
            # 0 opens a tag, 2 closes one; the tags are never matched up.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
        # A roll of 3 adds nothing to the document.
    return "<html>" + "\n".join(pieces) + "</html>"
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random document and times how long Beautiful Soup
    takes to parse it with each parser, then times lxml and html5lib
    parsing the same document without Beautiful Soup.

    :param num_elements: Passed to ``rdoc`` to size the generated document.
    :return: None. Timings are printed to standard output.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            # perf_counter() is a monotonic, high-resolution clock meant
            # for measuring intervals; time.time() can jump when the
            # system clock is adjusted.
            start = time.perf_counter()
            BeautifulSoup(data, parser)
            elapsed = time.perf_counter() - start
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))

    from lxml import etree
    start = time.perf_counter()
    etree.HTML(data)
    elapsed = time.perf_counter() - start
    print("Raw lxml parsed the markup in %.2fs." % elapsed)

    import html5lib
    raw_parser = html5lib.HTMLParser()
    start = time.perf_counter()
    raw_parser.parse(data)
    elapsed = time.perf_counter() - start
    print("Raw html5lib parsed the markup in %.2fs." % elapsed)
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Runs ``bs4.BeautifulSoup(data, parser)`` under cProfile and prints
    up to 50 entries matching ``_html5lib|bs4``, sorted by cumulative
    time.

    :param num_elements: Passed to ``rdoc`` to size the generated document.
    :param parser: The parser to hand to the BeautifulSoup constructor.
    :return: None. The profile report is printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # Write the stats into a private temporary directory rather than an
    # open NamedTemporaryFile: the open handle was never closed, and a
    # file that is still open cannot be reopened by name on Windows.
    with tempfile.TemporaryDirectory() as workdir:
        filename = os.path.join(workdir, "profile.stats")
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)
        # Stats() loads the file immediately, so it can be removed as
        # soon as this block exits.
        stats = pstats.Stats(filename)

    # stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
# When run as a script, diagnose the markup supplied on standard input.
if __name__ == '__main__':
    diagnose(sys.stdin.read())