You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

225 lines
6.8 KiB

4 years ago
  1. """Diagnostic functions, mainly for use when doing tech support."""
  2. # Use of this source code is governed by a BSD-style license that can be
  3. # found in the LICENSE file.
  4. __license__ = "MIT"
  5. import cProfile
  6. from io import StringIO
  7. from html.parser import HTMLParser
  8. import bs4
  9. from bs4 import BeautifulSoup, __version__
  10. from bs4.builder import builder_registry
  11. import os
  12. import pstats
  13. import random
  14. import tempfile
  15. import time
  16. import traceback
  17. import sys
  18. import cProfile
  19. def diagnose(data):
  20. """Diagnostic suite for isolating common problems."""
  21. print("Diagnostic running on Beautiful Soup %s" % __version__)
  22. print("Python version %s" % sys.version)
  23. basic_parsers = ["html.parser", "html5lib", "lxml"]
  24. for name in basic_parsers:
  25. for builder in builder_registry.builders:
  26. if name in builder.features:
  27. break
  28. else:
  29. basic_parsers.remove(name)
  30. print((
  31. "I noticed that %s is not installed. Installing it may help." %
  32. name))
  33. if 'lxml' in basic_parsers:
  34. basic_parsers.append("lxml-xml")
  35. try:
  36. from lxml import etree
  37. print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
  38. except ImportError as e:
  39. print (
  40. "lxml is not installed or couldn't be imported.")
  41. if 'html5lib' in basic_parsers:
  42. try:
  43. import html5lib
  44. print("Found html5lib version %s" % html5lib.__version__)
  45. except ImportError as e:
  46. print (
  47. "html5lib is not installed or couldn't be imported.")
  48. if hasattr(data, 'read'):
  49. data = data.read()
  50. elif data.startswith("http:") or data.startswith("https:"):
  51. print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
  52. print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
  53. return
  54. else:
  55. try:
  56. if os.path.exists(data):
  57. print('"%s" looks like a filename. Reading data from the file.' % data)
  58. with open(data) as fp:
  59. data = fp.read()
  60. except ValueError:
  61. # This can happen on some platforms when the 'filename' is
  62. # too long. Assume it's data and not a filename.
  63. pass
  64. print()
  65. for parser in basic_parsers:
  66. print("Trying to parse your markup with %s" % parser)
  67. success = False
  68. try:
  69. soup = BeautifulSoup(data, features=parser)
  70. success = True
  71. except Exception as e:
  72. print("%s could not parse the markup." % parser)
  73. traceback.print_exc()
  74. if success:
  75. print("Here's what %s did with the markup:" % parser)
  76. print(soup.prettify())
  77. print("-" * 80)
  78. def lxml_trace(data, html=True, **kwargs):
  79. """Print out the lxml events that occur during parsing.
  80. This lets you see how lxml parses a document when no Beautiful
  81. Soup code is running.
  82. """
  83. from lxml import etree
  84. for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
  85. print(("%s, %4s, %s" % (event, element.tag, element.text)))
  86. class AnnouncingParser(HTMLParser):
  87. """Announces HTMLParser parse events, without doing anything else."""
  88. def _p(self, s):
  89. print(s)
  90. def handle_starttag(self, name, attrs):
  91. self._p("%s START" % name)
  92. def handle_endtag(self, name):
  93. self._p("%s END" % name)
  94. def handle_data(self, data):
  95. self._p("%s DATA" % data)
  96. def handle_charref(self, name):
  97. self._p("%s CHARREF" % name)
  98. def handle_entityref(self, name):
  99. self._p("%s ENTITYREF" % name)
  100. def handle_comment(self, data):
  101. self._p("%s COMMENT" % data)
  102. def handle_decl(self, data):
  103. self._p("%s DECL" % data)
  104. def unknown_decl(self, data):
  105. self._p("%s UNKNOWN-DECL" % data)
  106. def handle_pi(self, data):
  107. self._p("%s PI" % data)
  108. def htmlparser_trace(data):
  109. """Print out the HTMLParser events that occur during parsing.
  110. This lets you see how HTMLParser parses a document when no
  111. Beautiful Soup code is running.
  112. """
  113. parser = AnnouncingParser()
  114. parser.feed(data)
# Letter pools used by rword() to alternate consonants and vowels when
# building random word-like strings.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
  117. def rword(length=5):
  118. "Generate a random word-like string."
  119. s = ''
  120. for i in range(length):
  121. if i % 2 == 0:
  122. t = _consonants
  123. else:
  124. t = _vowels
  125. s += random.choice(t)
  126. return s
  127. def rsentence(length=4):
  128. "Generate a random sentence-like string."
  129. return " ".join(rword(random.randint(4,9)) for i in range(length))
  130. def rdoc(num_elements=1000):
  131. """Randomly generate an invalid HTML document."""
  132. tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
  133. elements = []
  134. for i in range(num_elements):
  135. choice = random.randint(0,3)
  136. if choice == 0:
  137. # New tag.
  138. tag_name = random.choice(tag_names)
  139. elements.append("<%s>" % tag_name)
  140. elif choice == 1:
  141. elements.append(rsentence(random.randint(1,4)))
  142. elif choice == 2:
  143. # Close a tag.
  144. tag_name = random.choice(tag_names)
  145. elements.append("</%s>" % tag_name)
  146. return "<html>" + "\n".join(elements) + "</html>"
  147. def benchmark_parsers(num_elements=100000):
  148. """Very basic head-to-head performance benchmark."""
  149. print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
  150. data = rdoc(num_elements)
  151. print("Generated a large invalid HTML document (%d bytes)." % len(data))
  152. for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
  153. success = False
  154. try:
  155. a = time.time()
  156. soup = BeautifulSoup(data, parser)
  157. b = time.time()
  158. success = True
  159. except Exception as e:
  160. print("%s could not parse the markup." % parser)
  161. traceback.print_exc()
  162. if success:
  163. print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
  164. from lxml import etree
  165. a = time.time()
  166. etree.HTML(data)
  167. b = time.time()
  168. print("Raw lxml parsed the markup in %.2fs." % (b-a))
  169. import html5lib
  170. parser = html5lib.HTMLParser()
  171. a = time.time()
  172. parser.parse(data)
  173. b = time.time()
  174. print("Raw html5lib parsed the markup in %.2fs." % (b-a))
  175. def profile(num_elements=100000, parser="lxml"):
  176. filehandle = tempfile.NamedTemporaryFile()
  177. filename = filehandle.name
  178. data = rdoc(num_elements)
  179. vars = dict(bs4=bs4, data=data, parser=parser)
  180. cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
  181. stats = pstats.Stats(filename)
  182. # stats.strip_dirs()
  183. stats.sort_stats("cumulative")
  184. stats.print_stats('_html5lib|bs4', 50)
if __name__ == '__main__':
    # When run as a script, diagnose whatever markup is piped to stdin.
    diagnose(sys.stdin.read())