from __future__ import absolute_import, division, unicode_literals
|
|
|
|
from . import base
|
|
|
|
|
|
class Filter(base.Filter):
    """Removes optional tags from the token stream.

    Iterating the filter walks ``self.source`` with a three-token window
    (previous, current, next) and drops start/end tag tokens that
    ``is_optional_start`` / ``is_optional_end`` report as omissible.
    All other tokens are passed through unchanged.
    """

    # Elements whose start tag, when it directly follows a ``</p>``, allows
    # that ``</p>`` to be omitted.
    _P_END_OMITTED_BEFORE = frozenset((
        'address', 'article', 'aside', 'blockquote', 'datagrid', 'dialog',
        'dir', 'div', 'dl', 'fieldset', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'header', 'hr', 'menu', 'nav', 'ol', 'p', 'pre', 'section',
        'table', 'ul',
    ))

    def slider(self):
        """Yield ``(previous, current, next)`` windows over ``self.source``.

        ``previous`` is ``None`` for the first token and ``next`` is ``None``
        for the last one. Nothing is yielded when the source is empty.
        """
        previous1 = previous2 = None
        for token in self.source:
            # Emission lags one token behind so the look-ahead is known.
            if previous1 is not None:
                yield previous2, previous1, token
            previous2 = previous1
            previous1 = token
        # Flush the final token, which has no successor.
        if previous1 is not None:
            yield previous2, previous1, None

    def __iter__(self):
        """Yield the source tokens, skipping omissible start/end tags.

        A ``StartTag`` token is kept when its ``data`` is truthy or when its
        start tag is not optional in context; an ``EndTag`` token is kept
        when its end tag is not optional. Every other token type is yielded
        as is.
        """
        for previous, token, next in self.slider():
            token_type = token["type"]
            if token_type == "StartTag":
                if (token["data"] or
                        not self.is_optional_start(token["name"],
                                                   previous, next)):
                    yield token
            elif token_type == "EndTag":
                if not self.is_optional_end(token["name"], next):
                    yield token
            else:
                yield token

    def is_optional_start(self, tagname, previous, next):
        """Return True if the start tag of ``tagname`` may be omitted.

        ``previous`` and ``next`` are the tokens surrounding the start tag
        in the source stream (either may be ``None`` at the stream edges).
        Only ``html``, ``head``, ``body``, ``colgroup`` and ``tbody`` are
        ever reported as optional; any other name returns False.
        """
        next_type = next["type"] if next else None
        # Exact comparison: a substring test against 'html' would also
        # match names such as 'h', 'ml' or the empty string.
        if tagname == 'html':
            # An html element's start tag may be omitted if the first thing
            # inside the html element is not a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
            # XXX: we also omit the start tag if the head element is empty
            if next_type in ("StartTag", "EmptyTag"):
                return True
            elif next_type == "EndTag":
                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
            # except if the first thing inside the body element is a script
            # or style element and the node immediately preceding the body
            # element is a head element whose end tag has been omitted.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we do not look at the preceding event, so we never
                # omit the body element's start tag if it's followed by a
                # script or a style element.
                return next["name"] not in ('script', 'style')
            else:
                return True
        elif tagname == 'colgroup':
            # A colgroup element's start tag may be omitted if the first
            # thing inside the colgroup element is a col element, and if the
            # element is not immediately preceded by another colgroup
            # element whose end tag has been omitted.
            if next_type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we
                # never omit the colgroup element's end tag when it is
                # immediately followed by another colgroup element. See
                # is_optional_end.
                return next["name"] == "col"
            else:
                return False
        elif tagname == 'tbody':
            # A tbody element's start tag may be omitted if the first thing
            # inside the tbody element is a tr element, and if the element
            # is not immediately preceded by a tbody, thead, or tfoot
            # element whose end tag has been omitted.
            if next_type == "StartTag":
                # is_optional_end may omit a tbody/thead/tfoot end tag that
                # is directly followed by a tbody, so the start tag is kept
                # whenever the previous source token is such an end tag.
                if previous and previous['type'] == 'EndTag' and \
                        previous['name'] in ('tbody', 'thead', 'tfoot'):
                    return False
                return next["name"] == 'tr'
            else:
                return False
        return False

    def is_optional_end(self, tagname, next):
        """Return True if the end tag of ``tagname`` may be omitted.

        ``next`` is the token following the end tag in the source stream,
        or ``None`` when the end tag is the last token. Names without a
        rule below return False.
        """
        next_type = next["type"] if next else None
        # "No more content in the parent element" is approximated by the
        # next token being an end tag or the end of the stream.
        parent_is_closing = next_type == "EndTag" or next_type is None
        if tagname in ('html', 'head', 'body'):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return next_type not in ("Comment", "SpaceCharacters")
        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] == tagname
            else:
                return parent_is_closing
        elif tagname in ('dt', 'dd'):
            # A dt element's end tag may be omitted if the dt element is
            # immediately followed by another dt element or a dd element.
            # A dd element's end tag may be omitted if the dd element is
            # immediately followed by another dd element or a dt element,
            # or if there is no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('dt', 'dd')
            elif tagname == 'dd':
                return parent_is_closing
            else:
                return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
            # immediately followed by an address, article, aside,
            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
            # nav, ol, p, pre, section, table, or ul, element, or if
            # there is no more content in the parent element.
            if next_type in ("StartTag", "EmptyTag"):
                return next["name"] in self._P_END_OMITTED_BEFORE
            else:
                return parent_is_closing
        elif tagname == 'option':
            # An option element's end tag may be omitted if the option
            # element is immediately followed by another option element,
            # or if it is immediately followed by an <code>optgroup</code>
            # element, or if there is no more content in the parent
            # element.
            if next_type == "StartTag":
                return next["name"] in ('option', 'optgroup')
            else:
                return parent_is_closing
        elif tagname in ('rt', 'rp'):
            # An rt element's end tag may be omitted if the rt element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            # An rp element's end tag may be omitted if the rp element is
            # immediately followed by an rt or rp element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('rt', 'rp')
            else:
                return parent_is_closing
        elif tagname == 'colgroup':
            # A colgroup element's end tag may be omitted if the colgroup
            # element is not immediately followed by a space character or
            # a comment.
            if next_type in ("Comment", "SpaceCharacters"):
                return False
            elif next_type == "StartTag":
                # XXX: we also look for an immediately following colgroup
                # element. See is_optional_start.
                return next["name"] != 'colgroup'
            else:
                return True
        elif tagname in ('thead', 'tbody'):
            # A thead element's end tag may be omitted if the thead element
            # is immediately followed by a tbody or tfoot element.
            # A tbody element's end tag may be omitted if the tbody element
            # is immediately followed by a tbody or tfoot element, or if
            # there is no more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag as the
            # previous token.
            if next_type == "StartTag":
                return next["name"] in ('tbody', 'tfoot')
            elif tagname == 'tbody':
                return parent_is_closing
            else:
                return False
        elif tagname == 'tfoot':
            # A tfoot element's end tag may be omitted if the tfoot element
            # is immediately followed by a tbody element, or if there is no
            # more content in the parent element.
            # When the following element is a tbody, is_optional_start keeps
            # that tbody's start tag because it sees this end tag as the
            # previous token.
            if next_type == "StartTag":
                return next["name"] == 'tbody'
            else:
                return parent_is_closing
        elif tagname in ('td', 'th'):
            # A td element's end tag may be omitted if the td element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            # A th element's end tag may be omitted if the th element is
            # immediately followed by a td or th element, or if there is
            # no more content in the parent element.
            if next_type == "StartTag":
                return next["name"] in ('td', 'th')
            else:
                return parent_is_closing
        return False
|