|
|
- """
- An interface to html5lib that mimics the lxml.html interface.
- """
- import sys
- import string
-
- from html5lib import HTMLParser as _HTMLParser
- from html5lib.treebuilders.etree_lxml import TreeBuilder
- from lxml import etree
- from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
-
- # python3 compatibility
# ``basestring`` only exists on Python 2; probe for it and fall back to the
# Python 3 pair of string types when the name is missing.
try:
    basestring
except NameError:
    _strings = (bytes, str)
else:
    _strings = basestring
- try:
- from urllib2 import urlopen
- except ImportError:
- from urllib.request import urlopen
- try:
- from urlparse import urlparse
- except ImportError:
- from urllib.parse import urlparse
-
-
class HTMLParser(_HTMLParser):
    """html5lib HTML parser that always builds its tree with lxml.

    ``strict`` and any additional keyword arguments are forwarded unchanged
    to the html5lib parser constructor; the tree builder is fixed to the
    lxml ``TreeBuilder``.
    """

    def __init__(self, strict=False, **kwargs):
        super(HTMLParser, self).__init__(
            tree=TreeBuilder, strict=strict, **kwargs)
-
-
# XHTMLParser may be missing from the installed html5lib.  When the import
# fails, neither the wrapper class nor the shared ``xhtml_parser`` instance
# is defined by this module.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib XHTML parser that always builds its tree with lxml.

        ``strict`` and any additional keyword arguments are forwarded
        unchanged to the html5lib parser constructor.
        """

        def __init__(self, strict=False, **kwargs):
            super(XHTMLParser, self).__init__(
                tree=TreeBuilder, strict=strict, **kwargs)

    # Module-level default XHTML parser instance.
    xhtml_parser = XHTMLParser()
-
-
def _find_tag(tree, tag):
    """Look up ``tag`` in ``tree``, with or without the XHTML namespace.

    The bare tag name is tried first via ``tree.find()``; if that yields
    nothing, the lookup is repeated with the name qualified by
    ``XHTML_NAMESPACE``.  Returns whatever ``tree.find()`` returns for the
    second lookup when the first one finds nothing (possibly ``None``).
    """
    match = tree.find(tag)
    if match is None:
        match = tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return match
-
-
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document from a string and return its root element.

    `html` must be a (byte or Unicode) string, otherwise TypeError is
    raised.  `parser` defaults to the module-level `html_parser`.

    `guess_charset` is handed to the parser as its `useChardet` option.
    When it is left as None it defaults to true for byte-string input and
    is not passed at all for any other input.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet when the input would already
    # produce unicode objects, so the option is only defaulted for bytes.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True

    parse_kwargs = {} if guess_charset is None else {'useChardet': guess_charset}
    return active_parser.parse(html, **parse_kwargs).getroot()
-
-
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements and return them as a list.

    The fragment is parsed in the context of a ``div`` container.  The
    first item of the result may be a string (leading text).  If
    `no_leading_text` is true, non-whitespace leading text raises
    ``etree.ParserError`` and a whitespace-only leading string is dropped,
    so the result then holds no leading string.

    `guess_charset` is handed to the parser as its `useChardet` option.
    When it is left as None it defaults to false for byte-string input and
    is not passed at all for any other input.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet when the input would already
    # produce unicode objects, so the option is only defaulted for bytes.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False

    parse_kwargs = {} if guess_charset is None else {'useChardet': guess_charset}
    nodes = active_parser.parseFragment(html, 'div', **parse_kwargs)

    if no_leading_text and nodes and isinstance(nodes[0], _strings):
        leading = nodes[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' %
                                    leading)
        del nodes[0]
    return nodes
-
-
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a single HTML element from a string.

    Without `create_parent`, exactly one element must be present:
    ``etree.ParserError`` is raised when no element is found, when more
    than one is found, or when non-whitespace text precedes or follows the
    element.  The returned element has its tail cleared.

    If `create_parent` is true, a new parent element is created to hold
    the parsed content and is returned instead; leading text becomes the
    parent's text.  A string value is used as the parent's tag name, any
    other true value means ``div``.

    `guess_charset` and `parser` are passed on to `fragments_fromstring`.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap_in_parent = bool(create_parent)

    parts = fragments_fromstring(
        html, no_leading_text=not wrap_in_parent,
        guess_charset=guess_charset, parser=parser)

    if not wrap_in_parent:
        # Strict mode: exactly one element, with nothing but whitespace
        # after it.
        if not parts:
            raise etree.ParserError('No elements found')
        if len(parts) > 1:
            raise etree.ParserError('Multiple elements found')
        only = parts[0]
        trailing = only.tail
        if trailing and trailing.strip():
            raise etree.ParserError('Element followed by text: %r' % trailing)
        only.tail = None
        return only

    parent_tag = create_parent if isinstance(create_parent, _strings) else 'div'
    container = Element(parent_tag)
    if parts:
        if isinstance(parts[0], _strings):
            # Leading text is stored on the new parent, not kept as a child.
            container.text = parts[0]
            del parts[0]
        container.extend(parts)
    return container
-
-
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document:

    * input starting with ``<html`` or ``<!doctype`` (ignoring case and
      leading whitespace), or yielding a non-empty ``head``, returns the
      document root;
    * a body holding exactly one element and no surrounding non-whitespace
      text returns that element;
    * anything else returns the body, retagged as ``div`` when it contains
      a block-level tag and as ``span`` otherwise.

    `guess_charset` and `parser` are passed on to `document_fromstring`.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # A leading doctype or <html> tag means a full document was passed in.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Decode so the text comparison below works on Python 3.  ASCII is
        # enough here: it also covers latin-1 and utf-8 for the characters
        # being matched.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith(('<html', '<!doctype')):
        return doc

    # A non-empty head also means a full document.
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # A body with just one element and no stray text around it was
    # probably a single element passed in.
    if len(body) == 1:
        only_child = body[0]
        has_leading_text = body.text and body.text.strip()
        has_trailing_text = only_child.tail and only_child.tail.strip()
        if not has_leading_text and not has_trailing_text:
            return only_child

    # Otherwise the body holds the tags that were passed in.  Reuse it as
    # a fake container, renamed because <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
-
-
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A stream opened here (for a URL or a file path) is closed again once
    parsing has finished or failed.  A file-like object supplied by the
    caller is never closed.
    """
    if parser is None:
        parser = html_parser

    # Only set for streams this function opens itself, so that the caller's
    # own file objects are left untouched.
    owned_stream = None
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = owned_stream = urlopen(filename_url_or_file)
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = owned_stream = open(filename_url_or_file, 'rb')
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset

    try:
        return parser.parse(fp, **options)
    finally:
        # Avoid leaking the file handle / HTTP response opened above.
        if owned_stream is not None:
            owned_stream.close()
-
-
def _looks_like_url(str):
    """Return True if ``str`` parses as having a URL scheme.

    A value without a scheme is not a URL.  On Windows
    (``sys.platform == 'win32'``) a scheme that is a single ASCII letter is
    treated as the drive letter of an absolute path, not as a URL scheme.

    ``str`` may be text or bytes.
    """
    scheme = urlparse(str)[0]
    if not scheme:
        return False
    if isinstance(scheme, bytes):
        # urlparse() returns bytes components for bytes input on Python 3.
        # Normalise to text so the membership test below cannot raise
        # TypeError for a bytes path such as b'C:\\page.html'.
        scheme = scheme.decode('ascii', 'replace')
    if (sys.platform == 'win32' and len(scheme) == 1
            and scheme in string.ascii_letters):
        # looks like a 'normal' absolute path
        return False
    return True
-
-
# Module-level default parser, used by the functions above whenever their
# ``parser`` argument is left as None.
html_parser = HTMLParser()
|