You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

260 lines
8.4 KiB

4 years ago
  1. """
  2. An interface to html5lib that mimics the lxml.html interface.
  3. """
  4. import sys
  5. import string
  6. from html5lib import HTMLParser as _HTMLParser
  7. from html5lib.treebuilders.etree_lxml import TreeBuilder
  8. from lxml import etree
  9. from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
# Python 2/3 compatibility shims: each ``try`` uses the Python 2 name and
# falls back to the Python 3 equivalent when that name is unavailable.
try:
    # Python 2: ``basestring`` is the common base of ``str`` and ``unicode``.
    _strings = basestring
except NameError:
    # Python 3: no ``basestring``; accept both byte and text strings.
    _strings = (bytes, str)
try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse
  23. class HTMLParser(_HTMLParser):
  24. """An html5lib HTML parser with lxml as tree."""
  25. def __init__(self, strict=False, **kwargs):
  26. _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
# XHTMLParser support is optional: the class and the ``xhtml_parser``
# instance are only defined when html5lib exports ``XHTMLParser``.  If the
# import fails, neither name exists in this module.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML Parser with lxml as tree."""

        def __init__(self, strict=False, **kwargs):
            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()
  37. def _find_tag(tree, tag):
  38. elem = tree.find(tag)
  39. if elem is not None:
  40. return elem
  41. return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
  42. def document_fromstring(html, guess_charset=None, parser=None):
  43. """
  44. Parse a whole document into a string.
  45. If `guess_charset` is true, or if the input is not Unicode but a
  46. byte string, the `chardet` library will perform charset guessing
  47. on the string.
  48. """
  49. if not isinstance(html, _strings):
  50. raise TypeError('string required')
  51. if parser is None:
  52. parser = html_parser
  53. options = {}
  54. if guess_charset is None and isinstance(html, bytes):
  55. # html5lib does not accept useChardet as an argument, if it
  56. # detected the html argument would produce unicode objects.
  57. guess_charset = True
  58. if guess_charset is not None:
  59. options['useChardet'] = guess_charset
  60. return parser.parse(html, **options).getroot()
  61. def fragments_fromstring(html, no_leading_text=False,
  62. guess_charset=None, parser=None):
  63. """Parses several HTML elements, returning a list of elements.
  64. The first item in the list may be a string. If no_leading_text is true,
  65. then it will be an error if there is leading text, and it will always be
  66. a list of only elements.
  67. If `guess_charset` is true, the `chardet` library will perform charset
  68. guessing on the string.
  69. """
  70. if not isinstance(html, _strings):
  71. raise TypeError('string required')
  72. if parser is None:
  73. parser = html_parser
  74. options = {}
  75. if guess_charset is None and isinstance(html, bytes):
  76. # html5lib does not accept useChardet as an argument, if it
  77. # detected the html argument would produce unicode objects.
  78. guess_charset = False
  79. if guess_charset is not None:
  80. options['useChardet'] = guess_charset
  81. children = parser.parseFragment(html, 'div', **options)
  82. if children and isinstance(children[0], _strings):
  83. if no_leading_text:
  84. if children[0].strip():
  85. raise etree.ParserError('There is leading text: %r' %
  86. children[0])
  87. del children[0]
  88. return children
  89. def fragment_fromstring(html, create_parent=False,
  90. guess_charset=None, parser=None):
  91. """Parses a single HTML element; it is an error if there is more than
  92. one element, or if anything but whitespace precedes or follows the
  93. element.
  94. If 'create_parent' is true (or is a tag name) then a parent node
  95. will be created to encapsulate the HTML in a single element. In
  96. this case, leading or trailing text is allowed.
  97. If `guess_charset` is true, the `chardet` library will perform charset
  98. guessing on the string.
  99. """
  100. if not isinstance(html, _strings):
  101. raise TypeError('string required')
  102. accept_leading_text = bool(create_parent)
  103. elements = fragments_fromstring(
  104. html, guess_charset=guess_charset, parser=parser,
  105. no_leading_text=not accept_leading_text)
  106. if create_parent:
  107. if not isinstance(create_parent, _strings):
  108. create_parent = 'div'
  109. new_root = Element(create_parent)
  110. if elements:
  111. if isinstance(elements[0], _strings):
  112. new_root.text = elements[0]
  113. del elements[0]
  114. new_root.extend(elements)
  115. return new_root
  116. if not elements:
  117. raise etree.ParserError('No elements found')
  118. if len(elements) > 1:
  119. raise etree.ParserError('Multiple elements found')
  120. result = elements[0]
  121. if result.tail and result.tail.strip():
  122. raise etree.ParserError('Element followed by text: %r' % result.tail)
  123. result.tail = None
  124. return result
  125. def fromstring(html, guess_charset=None, parser=None):
  126. """Parse the html, returning a single element/document.
  127. This tries to minimally parse the chunk of text, without knowing if it
  128. is a fragment or a document.
  129. 'base_url' will set the document's base_url attribute (and the tree's
  130. docinfo.URL)
  131. If `guess_charset` is true, or if the input is not Unicode but a
  132. byte string, the `chardet` library will perform charset guessing
  133. on the string.
  134. """
  135. if not isinstance(html, _strings):
  136. raise TypeError('string required')
  137. doc = document_fromstring(html, parser=parser,
  138. guess_charset=guess_charset)
  139. # document starts with doctype or <html>, full document!
  140. start = html[:50]
  141. if isinstance(start, bytes):
  142. # Allow text comparison in python3.
  143. # Decode as ascii, that also covers latin-1 and utf-8 for the
  144. # characters we need.
  145. start = start.decode('ascii', 'replace')
  146. start = start.lstrip().lower()
  147. if start.startswith('<html') or start.startswith('<!doctype'):
  148. return doc
  149. head = _find_tag(doc, 'head')
  150. # if the head is not empty we have a full document
  151. if len(head):
  152. return doc
  153. body = _find_tag(doc, 'body')
  154. # The body has just one element, so it was probably a single
  155. # element passed in
  156. if (len(body) == 1 and (not body.text or not body.text.strip())
  157. and (not body[-1].tail or not body[-1].tail.strip())):
  158. return body[0]
  159. # Now we have a body which represents a bunch of tags which have the
  160. # content that was passed in. We will create a fake container, which
  161. # is the body tag, except <body> implies too much structure.
  162. if _contains_block_level_tag(body):
  163. body.tag = 'div'
  164. else:
  165. body.tag = 'span'
  166. return body
  167. def parse(filename_url_or_file, guess_charset=None, parser=None):
  168. """Parse a filename, URL, or file-like object into an HTML document
  169. tree. Note: this returns a tree, not an element. Use
  170. ``parse(...).getroot()`` to get the document root.
  171. If ``guess_charset`` is true, the ``useChardet`` option is passed into
  172. html5lib to enable character detection. This option is on by default
  173. when parsing from URLs, off by default when parsing from file(-like)
  174. objects (which tend to return Unicode more often than not), and on by
  175. default when parsing from a file path (which is read in binary mode).
  176. """
  177. if parser is None:
  178. parser = html_parser
  179. if not isinstance(filename_url_or_file, _strings):
  180. fp = filename_url_or_file
  181. if guess_charset is None:
  182. # assume that file-like objects return Unicode more often than bytes
  183. guess_charset = False
  184. elif _looks_like_url(filename_url_or_file):
  185. fp = urlopen(filename_url_or_file)
  186. if guess_charset is None:
  187. # assume that URLs return bytes
  188. guess_charset = True
  189. else:
  190. fp = open(filename_url_or_file, 'rb')
  191. if guess_charset is None:
  192. guess_charset = True
  193. options = {}
  194. # html5lib does not accept useChardet as an argument, if it
  195. # detected the html argument would produce unicode objects.
  196. if guess_charset:
  197. options['useChardet'] = guess_charset
  198. return parser.parse(fp, **options)
  199. def _looks_like_url(str):
  200. scheme = urlparse(str)[0]
  201. if not scheme:
  202. return False
  203. elif (sys.platform == 'win32' and
  204. scheme in string.ascii_letters
  205. and len(scheme) == 1):
  206. # looks like a 'normal' absolute path
  207. return False
  208. else:
  209. return True
# Shared default parser instance; the functions in this module fall back to
# it whenever the caller does not supply ``parser``.
html_parser = HTMLParser()