You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2724 lines
115 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. from __future__ import absolute_import, division, unicode_literals
  2. from six import with_metaclass
  3. import types
  4. from . import inputstream
  5. from . import tokenizer
  6. from . import treebuilders
  7. from .treebuilders._base import Marker
  8. from . import utils
  9. from . import constants
  10. from .constants import spaceCharacters, asciiUpper2Lower
  11. from .constants import specialElements
  12. from .constants import headingElements
  13. from .constants import cdataElements, rcdataElements
  14. from .constants import tokenTypes, ReparseException, namespaces
  15. from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
  16. from .constants import adjustForeignAttributes as adjustForeignAttributesMap
  17. from .constants import E
  18. def parse(doc, treebuilder="etree", encoding=None,
  19. namespaceHTMLElements=True):
  20. """Parse a string or file-like object into a tree"""
  21. tb = treebuilders.getTreeBuilder(treebuilder)
  22. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  23. return p.parse(doc, encoding=encoding)
  24. def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
  25. namespaceHTMLElements=True):
  26. tb = treebuilders.getTreeBuilder(treebuilder)
  27. p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
  28. return p.parseFragment(doc, container=container, encoding=encoding)
  29. def method_decorator_metaclass(function):
  30. class Decorated(type):
  31. def __new__(meta, classname, bases, classDict):
  32. for attributeName, attribute in classDict.items():
  33. if isinstance(attribute, types.FunctionType):
  34. attribute = function(attribute)
  35. classDict[attributeName] = attribute
  36. return type.__new__(meta, classname, bases, classDict)
  37. return Decorated
  38. class HTMLParser(object):
  39. """HTML parser. Generates a tree structure from a stream of (possibly
  40. malformed) HTML"""
  41. def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
  42. strict=False, namespaceHTMLElements=True, debug=False):
  43. """
  44. strict - raise an exception when a parse error is encountered
  45. tree - a treebuilder class controlling the type of tree that will be
  46. returned. Built in treebuilders can be accessed through
  47. html5lib.treebuilders.getTreeBuilder(treeType)
  48. tokenizer - a class that provides a stream of tokens to the treebuilder.
  49. This may be replaced for e.g. a sanitizer which converts some tags to
  50. text
  51. """
  52. # Raise an exception on the first error encountered
  53. self.strict = strict
  54. if tree is None:
  55. tree = treebuilders.getTreeBuilder("etree")
  56. self.tree = tree(namespaceHTMLElements)
  57. self.tokenizer_class = tokenizer
  58. self.errors = []
  59. self.phases = dict([(name, cls(self, self.tree)) for name, cls in
  60. getPhases(debug).items()])
  61. def _parse(self, stream, innerHTML=False, container="div",
  62. encoding=None, parseMeta=True, useChardet=True, **kwargs):
  63. self.innerHTMLMode = innerHTML
  64. self.container = container
  65. self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
  66. parseMeta=parseMeta,
  67. useChardet=useChardet,
  68. parser=self, **kwargs)
  69. self.reset()
  70. while True:
  71. try:
  72. self.mainLoop()
  73. break
  74. except ReparseException:
  75. self.reset()
  76. def reset(self):
  77. self.tree.reset()
  78. self.firstStartTag = False
  79. self.errors = []
  80. self.log = [] # only used with debug mode
  81. # "quirks" / "limited quirks" / "no quirks"
  82. self.compatMode = "no quirks"
  83. if self.innerHTMLMode:
  84. self.innerHTML = self.container.lower()
  85. if self.innerHTML in cdataElements:
  86. self.tokenizer.state = self.tokenizer.rcdataState
  87. elif self.innerHTML in rcdataElements:
  88. self.tokenizer.state = self.tokenizer.rawtextState
  89. elif self.innerHTML == 'plaintext':
  90. self.tokenizer.state = self.tokenizer.plaintextState
  91. else:
  92. # state already is data state
  93. # self.tokenizer.state = self.tokenizer.dataState
  94. pass
  95. self.phase = self.phases["beforeHtml"]
  96. self.phase.insertHtmlElement()
  97. self.resetInsertionMode()
  98. else:
  99. self.innerHTML = False
  100. self.phase = self.phases["initial"]
  101. self.lastPhase = None
  102. self.beforeRCDataPhase = None
  103. self.framesetOK = True
  104. @property
  105. def documentEncoding(self):
  106. """The name of the character encoding
  107. that was used to decode the input stream,
  108. or :obj:`None` if that is not determined yet.
  109. """
  110. if not hasattr(self, 'tokenizer'):
  111. return None
  112. return self.tokenizer.stream.charEncoding[0]
  113. def isHTMLIntegrationPoint(self, element):
  114. if (element.name == "annotation-xml" and
  115. element.namespace == namespaces["mathml"]):
  116. return ("encoding" in element.attributes and
  117. element.attributes["encoding"].translate(
  118. asciiUpper2Lower) in
  119. ("text/html", "application/xhtml+xml"))
  120. else:
  121. return (element.namespace, element.name) in htmlIntegrationPointElements
  122. def isMathMLTextIntegrationPoint(self, element):
  123. return (element.namespace, element.name) in mathmlTextIntegrationPointElements
  124. def mainLoop(self):
  125. CharactersToken = tokenTypes["Characters"]
  126. SpaceCharactersToken = tokenTypes["SpaceCharacters"]
  127. StartTagToken = tokenTypes["StartTag"]
  128. EndTagToken = tokenTypes["EndTag"]
  129. CommentToken = tokenTypes["Comment"]
  130. DoctypeToken = tokenTypes["Doctype"]
  131. ParseErrorToken = tokenTypes["ParseError"]
  132. for token in self.normalizedTokens():
  133. new_token = token
  134. while new_token is not None:
  135. currentNode = self.tree.openElements[-1] if self.tree.openElements else None
  136. currentNodeNamespace = currentNode.namespace if currentNode else None
  137. currentNodeName = currentNode.name if currentNode else None
  138. type = new_token["type"]
  139. if type == ParseErrorToken:
  140. self.parseError(new_token["data"], new_token.get("datavars", {}))
  141. new_token = None
  142. else:
  143. if (len(self.tree.openElements) == 0 or
  144. currentNodeNamespace == self.tree.defaultNamespace or
  145. (self.isMathMLTextIntegrationPoint(currentNode) and
  146. ((type == StartTagToken and
  147. token["name"] not in frozenset(["mglyph", "malignmark"])) or
  148. type in (CharactersToken, SpaceCharactersToken))) or
  149. (currentNodeNamespace == namespaces["mathml"] and
  150. currentNodeName == "annotation-xml" and
  151. token["name"] == "svg") or
  152. (self.isHTMLIntegrationPoint(currentNode) and
  153. type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
  154. phase = self.phase
  155. else:
  156. phase = self.phases["inForeignContent"]
  157. if type == CharactersToken:
  158. new_token = phase.processCharacters(new_token)
  159. elif type == SpaceCharactersToken:
  160. new_token = phase.processSpaceCharacters(new_token)
  161. elif type == StartTagToken:
  162. new_token = phase.processStartTag(new_token)
  163. elif type == EndTagToken:
  164. new_token = phase.processEndTag(new_token)
  165. elif type == CommentToken:
  166. new_token = phase.processComment(new_token)
  167. elif type == DoctypeToken:
  168. new_token = phase.processDoctype(new_token)
  169. if (type == StartTagToken and token["selfClosing"]
  170. and not token["selfClosingAcknowledged"]):
  171. self.parseError("non-void-element-with-trailing-solidus",
  172. {"name": token["name"]})
  173. # When the loop finishes it's EOF
  174. reprocess = True
  175. phases = []
  176. while reprocess:
  177. phases.append(self.phase)
  178. reprocess = self.phase.processEOF()
  179. if reprocess:
  180. assert self.phase not in phases
  181. def normalizedTokens(self):
  182. for token in self.tokenizer:
  183. yield self.normalizeToken(token)
  184. def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
  185. """Parse a HTML document into a well-formed tree
  186. stream - a filelike object or string containing the HTML to be parsed
  187. The optional encoding parameter must be a string that indicates
  188. the encoding. If specified, that encoding will be used,
  189. regardless of any BOM or later declaration (such as in a meta
  190. element)
  191. """
  192. self._parse(stream, innerHTML=False, encoding=encoding,
  193. parseMeta=parseMeta, useChardet=useChardet)
  194. return self.tree.getDocument()
  195. def parseFragment(self, stream, container="div", encoding=None,
  196. parseMeta=False, useChardet=True):
  197. """Parse a HTML fragment into a well-formed tree fragment
  198. container - name of the element we're setting the innerHTML property
  199. if set to None, default to 'div'
  200. stream - a filelike object or string containing the HTML to be parsed
  201. The optional encoding parameter must be a string that indicates
  202. the encoding. If specified, that encoding will be used,
  203. regardless of any BOM or later declaration (such as in a meta
  204. element)
  205. """
  206. self._parse(stream, True, container=container, encoding=encoding)
  207. return self.tree.getFragment()
  208. def parseError(self, errorcode="XXX-undefined-error", datavars={}):
  209. # XXX The idea is to make errorcode mandatory.
  210. self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
  211. if self.strict:
  212. raise ParseError(E[errorcode] % datavars)
  213. def normalizeToken(self, token):
  214. """ HTML5 specific normalizations to the token stream """
  215. if token["type"] == tokenTypes["StartTag"]:
  216. token["data"] = dict(token["data"][::-1])
  217. return token
  218. def adjustMathMLAttributes(self, token):
  219. replacements = {"definitionurl": "definitionURL"}
  220. for k, v in replacements.items():
  221. if k in token["data"]:
  222. token["data"][v] = token["data"][k]
  223. del token["data"][k]
  224. def adjustSVGAttributes(self, token):
  225. replacements = {
  226. "attributename": "attributeName",
  227. "attributetype": "attributeType",
  228. "basefrequency": "baseFrequency",
  229. "baseprofile": "baseProfile",
  230. "calcmode": "calcMode",
  231. "clippathunits": "clipPathUnits",
  232. "contentscripttype": "contentScriptType",
  233. "contentstyletype": "contentStyleType",
  234. "diffuseconstant": "diffuseConstant",
  235. "edgemode": "edgeMode",
  236. "externalresourcesrequired": "externalResourcesRequired",
  237. "filterres": "filterRes",
  238. "filterunits": "filterUnits",
  239. "glyphref": "glyphRef",
  240. "gradienttransform": "gradientTransform",
  241. "gradientunits": "gradientUnits",
  242. "kernelmatrix": "kernelMatrix",
  243. "kernelunitlength": "kernelUnitLength",
  244. "keypoints": "keyPoints",
  245. "keysplines": "keySplines",
  246. "keytimes": "keyTimes",
  247. "lengthadjust": "lengthAdjust",
  248. "limitingconeangle": "limitingConeAngle",
  249. "markerheight": "markerHeight",
  250. "markerunits": "markerUnits",
  251. "markerwidth": "markerWidth",
  252. "maskcontentunits": "maskContentUnits",
  253. "maskunits": "maskUnits",
  254. "numoctaves": "numOctaves",
  255. "pathlength": "pathLength",
  256. "patterncontentunits": "patternContentUnits",
  257. "patterntransform": "patternTransform",
  258. "patternunits": "patternUnits",
  259. "pointsatx": "pointsAtX",
  260. "pointsaty": "pointsAtY",
  261. "pointsatz": "pointsAtZ",
  262. "preservealpha": "preserveAlpha",
  263. "preserveaspectratio": "preserveAspectRatio",
  264. "primitiveunits": "primitiveUnits",
  265. "refx": "refX",
  266. "refy": "refY",
  267. "repeatcount": "repeatCount",
  268. "repeatdur": "repeatDur",
  269. "requiredextensions": "requiredExtensions",
  270. "requiredfeatures": "requiredFeatures",
  271. "specularconstant": "specularConstant",
  272. "specularexponent": "specularExponent",
  273. "spreadmethod": "spreadMethod",
  274. "startoffset": "startOffset",
  275. "stddeviation": "stdDeviation",
  276. "stitchtiles": "stitchTiles",
  277. "surfacescale": "surfaceScale",
  278. "systemlanguage": "systemLanguage",
  279. "tablevalues": "tableValues",
  280. "targetx": "targetX",
  281. "targety": "targetY",
  282. "textlength": "textLength",
  283. "viewbox": "viewBox",
  284. "viewtarget": "viewTarget",
  285. "xchannelselector": "xChannelSelector",
  286. "ychannelselector": "yChannelSelector",
  287. "zoomandpan": "zoomAndPan"
  288. }
  289. for originalName in list(token["data"].keys()):
  290. if originalName in replacements:
  291. svgName = replacements[originalName]
  292. token["data"][svgName] = token["data"][originalName]
  293. del token["data"][originalName]
  294. def adjustForeignAttributes(self, token):
  295. replacements = adjustForeignAttributesMap
  296. for originalName in token["data"].keys():
  297. if originalName in replacements:
  298. foreignName = replacements[originalName]
  299. token["data"][foreignName] = token["data"][originalName]
  300. del token["data"][originalName]
  301. def reparseTokenNormal(self, token):
  302. self.parser.phase()
  303. def resetInsertionMode(self):
  304. # The name of this method is mostly historical. (It's also used in the
  305. # specification.)
  306. last = False
  307. newModes = {
  308. "select": "inSelect",
  309. "td": "inCell",
  310. "th": "inCell",
  311. "tr": "inRow",
  312. "tbody": "inTableBody",
  313. "thead": "inTableBody",
  314. "tfoot": "inTableBody",
  315. "caption": "inCaption",
  316. "colgroup": "inColumnGroup",
  317. "table": "inTable",
  318. "head": "inBody",
  319. "body": "inBody",
  320. "frameset": "inFrameset",
  321. "html": "beforeHead"
  322. }
  323. for node in self.tree.openElements[::-1]:
  324. nodeName = node.name
  325. new_phase = None
  326. if node == self.tree.openElements[0]:
  327. assert self.innerHTML
  328. last = True
  329. nodeName = self.innerHTML
  330. # Check for conditions that should only happen in the innerHTML
  331. # case
  332. if nodeName in ("select", "colgroup", "head", "html"):
  333. assert self.innerHTML
  334. if not last and node.namespace != self.tree.defaultNamespace:
  335. continue
  336. if nodeName in newModes:
  337. new_phase = self.phases[newModes[nodeName]]
  338. break
  339. elif last:
  340. new_phase = self.phases["inBody"]
  341. break
  342. self.phase = new_phase
  343. def parseRCDataRawtext(self, token, contentType):
  344. """Generic RCDATA/RAWTEXT Parsing algorithm
  345. contentType - RCDATA or RAWTEXT
  346. """
  347. assert contentType in ("RAWTEXT", "RCDATA")
  348. self.tree.insertElement(token)
  349. if contentType == "RAWTEXT":
  350. self.tokenizer.state = self.tokenizer.rawtextState
  351. else:
  352. self.tokenizer.state = self.tokenizer.rcdataState
  353. self.originalPhase = self.phase
  354. self.phase = self.phases["text"]
  355. def getPhases(debug):
  356. def log(function):
  357. """Logger that records which phase processes each token"""
  358. type_names = dict((value, key) for key, value in
  359. constants.tokenTypes.items())
  360. def wrapped(self, *args, **kwargs):
  361. if function.__name__.startswith("process") and len(args) > 0:
  362. token = args[0]
  363. try:
  364. info = {"type": type_names[token['type']]}
  365. except:
  366. raise
  367. if token['type'] in constants.tagTokenTypes:
  368. info["name"] = token['name']
  369. self.parser.log.append((self.parser.tokenizer.state.__name__,
  370. self.parser.phase.__class__.__name__,
  371. self.__class__.__name__,
  372. function.__name__,
  373. info))
  374. return function(self, *args, **kwargs)
  375. else:
  376. return function(self, *args, **kwargs)
  377. return wrapped
  378. def getMetaclass(use_metaclass, metaclass_func):
  379. if use_metaclass:
  380. return method_decorator_metaclass(metaclass_func)
  381. else:
  382. return type
  383. class Phase(with_metaclass(getMetaclass(debug, log))):
  384. """Base class for helper object that implements each phase of processing
  385. """
  386. def __init__(self, parser, tree):
  387. self.parser = parser
  388. self.tree = tree
  389. def processEOF(self):
  390. raise NotImplementedError
  391. def processComment(self, token):
  392. # For most phases the following is correct. Where it's not it will be
  393. # overridden.
  394. self.tree.insertComment(token, self.tree.openElements[-1])
  395. def processDoctype(self, token):
  396. self.parser.parseError("unexpected-doctype")
  397. def processCharacters(self, token):
  398. self.tree.insertText(token["data"])
  399. def processSpaceCharacters(self, token):
  400. self.tree.insertText(token["data"])
  401. def processStartTag(self, token):
  402. return self.startTagHandler[token["name"]](token)
  403. def startTagHtml(self, token):
  404. if not self.parser.firstStartTag and token["name"] == "html":
  405. self.parser.parseError("non-html-root")
  406. # XXX Need a check here to see if the first start tag token emitted is
  407. # this token... If it's not, invoke self.parser.parseError().
  408. for attr, value in token["data"].items():
  409. if attr not in self.tree.openElements[0].attributes:
  410. self.tree.openElements[0].attributes[attr] = value
  411. self.parser.firstStartTag = False
  412. def processEndTag(self, token):
  413. return self.endTagHandler[token["name"]](token)
  414. class InitialPhase(Phase):
  415. def processSpaceCharacters(self, token):
  416. pass
  417. def processComment(self, token):
  418. self.tree.insertComment(token, self.tree.document)
  419. def processDoctype(self, token):
  420. name = token["name"]
  421. publicId = token["publicId"]
  422. systemId = token["systemId"]
  423. correct = token["correct"]
  424. if (name != "html" or publicId is not None or
  425. systemId is not None and systemId != "about:legacy-compat"):
  426. self.parser.parseError("unknown-doctype")
  427. if publicId is None:
  428. publicId = ""
  429. self.tree.insertDoctype(token)
  430. if publicId != "":
  431. publicId = publicId.translate(asciiUpper2Lower)
  432. if (not correct or token["name"] != "html"
  433. or publicId.startswith(
  434. ("+//silmaril//dtd html pro v0r11 19970101//",
  435. "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
  436. "-//as//dtd html 3.0 aswedit + extensions//",
  437. "-//ietf//dtd html 2.0 level 1//",
  438. "-//ietf//dtd html 2.0 level 2//",
  439. "-//ietf//dtd html 2.0 strict level 1//",
  440. "-//ietf//dtd html 2.0 strict level 2//",
  441. "-//ietf//dtd html 2.0 strict//",
  442. "-//ietf//dtd html 2.0//",
  443. "-//ietf//dtd html 2.1e//",
  444. "-//ietf//dtd html 3.0//",
  445. "-//ietf//dtd html 3.2 final//",
  446. "-//ietf//dtd html 3.2//",
  447. "-//ietf//dtd html 3//",
  448. "-//ietf//dtd html level 0//",
  449. "-//ietf//dtd html level 1//",
  450. "-//ietf//dtd html level 2//",
  451. "-//ietf//dtd html level 3//",
  452. "-//ietf//dtd html strict level 0//",
  453. "-//ietf//dtd html strict level 1//",
  454. "-//ietf//dtd html strict level 2//",
  455. "-//ietf//dtd html strict level 3//",
  456. "-//ietf//dtd html strict//",
  457. "-//ietf//dtd html//",
  458. "-//metrius//dtd metrius presentational//",
  459. "-//microsoft//dtd internet explorer 2.0 html strict//",
  460. "-//microsoft//dtd internet explorer 2.0 html//",
  461. "-//microsoft//dtd internet explorer 2.0 tables//",
  462. "-//microsoft//dtd internet explorer 3.0 html strict//",
  463. "-//microsoft//dtd internet explorer 3.0 html//",
  464. "-//microsoft//dtd internet explorer 3.0 tables//",
  465. "-//netscape comm. corp.//dtd html//",
  466. "-//netscape comm. corp.//dtd strict html//",
  467. "-//o'reilly and associates//dtd html 2.0//",
  468. "-//o'reilly and associates//dtd html extended 1.0//",
  469. "-//o'reilly and associates//dtd html extended relaxed 1.0//",
  470. "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
  471. "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
  472. "-//spyglass//dtd html 2.0 extended//",
  473. "-//sq//dtd html 2.0 hotmetal + extensions//",
  474. "-//sun microsystems corp.//dtd hotjava html//",
  475. "-//sun microsystems corp.//dtd hotjava strict html//",
  476. "-//w3c//dtd html 3 1995-03-24//",
  477. "-//w3c//dtd html 3.2 draft//",
  478. "-//w3c//dtd html 3.2 final//",
  479. "-//w3c//dtd html 3.2//",
  480. "-//w3c//dtd html 3.2s draft//",
  481. "-//w3c//dtd html 4.0 frameset//",
  482. "-//w3c//dtd html 4.0 transitional//",
  483. "-//w3c//dtd html experimental 19960712//",
  484. "-//w3c//dtd html experimental 970421//",
  485. "-//w3c//dtd w3 html//",
  486. "-//w3o//dtd w3 html 3.0//",
  487. "-//webtechs//dtd mozilla html 2.0//",
  488. "-//webtechs//dtd mozilla html//"))
  489. or publicId in
  490. ("-//w3o//dtd w3 html strict 3.0//en//",
  491. "-/w3c/dtd html 4.0 transitional/en",
  492. "html")
  493. or publicId.startswith(
  494. ("-//w3c//dtd html 4.01 frameset//",
  495. "-//w3c//dtd html 4.01 transitional//")) and
  496. systemId is None
  497. or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
  498. self.parser.compatMode = "quirks"
  499. elif (publicId.startswith(
  500. ("-//w3c//dtd xhtml 1.0 frameset//",
  501. "-//w3c//dtd xhtml 1.0 transitional//"))
  502. or publicId.startswith(
  503. ("-//w3c//dtd html 4.01 frameset//",
  504. "-//w3c//dtd html 4.01 transitional//")) and
  505. systemId is not None):
  506. self.parser.compatMode = "limited quirks"
  507. self.parser.phase = self.parser.phases["beforeHtml"]
  508. def anythingElse(self):
  509. self.parser.compatMode = "quirks"
  510. self.parser.phase = self.parser.phases["beforeHtml"]
  511. def processCharacters(self, token):
  512. self.parser.parseError("expected-doctype-but-got-chars")
  513. self.anythingElse()
  514. return token
  515. def processStartTag(self, token):
  516. self.parser.parseError("expected-doctype-but-got-start-tag",
  517. {"name": token["name"]})
  518. self.anythingElse()
  519. return token
  520. def processEndTag(self, token):
  521. self.parser.parseError("expected-doctype-but-got-end-tag",
  522. {"name": token["name"]})
  523. self.anythingElse()
  524. return token
  525. def processEOF(self):
  526. self.parser.parseError("expected-doctype-but-got-eof")
  527. self.anythingElse()
  528. return True
  529. class BeforeHtmlPhase(Phase):
  530. # helper methods
  531. def insertHtmlElement(self):
  532. self.tree.insertRoot(impliedTagToken("html", "StartTag"))
  533. self.parser.phase = self.parser.phases["beforeHead"]
  534. # other
  535. def processEOF(self):
  536. self.insertHtmlElement()
  537. return True
  538. def processComment(self, token):
  539. self.tree.insertComment(token, self.tree.document)
  540. def processSpaceCharacters(self, token):
  541. pass
  542. def processCharacters(self, token):
  543. self.insertHtmlElement()
  544. return token
  545. def processStartTag(self, token):
  546. if token["name"] == "html":
  547. self.parser.firstStartTag = True
  548. self.insertHtmlElement()
  549. return token
  550. def processEndTag(self, token):
  551. if token["name"] not in ("head", "body", "html", "br"):
  552. self.parser.parseError("unexpected-end-tag-before-html",
  553. {"name": token["name"]})
  554. else:
  555. self.insertHtmlElement()
  556. return token
  557. class BeforeHeadPhase(Phase):
  558. def __init__(self, parser, tree):
  559. Phase.__init__(self, parser, tree)
  560. self.startTagHandler = utils.MethodDispatcher([
  561. ("html", self.startTagHtml),
  562. ("head", self.startTagHead)
  563. ])
  564. self.startTagHandler.default = self.startTagOther
  565. self.endTagHandler = utils.MethodDispatcher([
  566. (("head", "body", "html", "br"), self.endTagImplyHead)
  567. ])
  568. self.endTagHandler.default = self.endTagOther
  569. def processEOF(self):
  570. self.startTagHead(impliedTagToken("head", "StartTag"))
  571. return True
  572. def processSpaceCharacters(self, token):
  573. pass
  574. def processCharacters(self, token):
  575. self.startTagHead(impliedTagToken("head", "StartTag"))
  576. return token
  577. def startTagHtml(self, token):
  578. return self.parser.phases["inBody"].processStartTag(token)
  579. def startTagHead(self, token):
  580. self.tree.insertElement(token)
  581. self.tree.headPointer = self.tree.openElements[-1]
  582. self.parser.phase = self.parser.phases["inHead"]
  583. def startTagOther(self, token):
  584. self.startTagHead(impliedTagToken("head", "StartTag"))
  585. return token
  586. def endTagImplyHead(self, token):
  587. self.startTagHead(impliedTagToken("head", "StartTag"))
  588. return token
  589. def endTagOther(self, token):
  590. self.parser.parseError("end-tag-after-implied-root",
  591. {"name": token["name"]})
  592. class InHeadPhase(Phase):
  593. def __init__(self, parser, tree):
  594. Phase.__init__(self, parser, tree)
  595. self.startTagHandler = utils.MethodDispatcher([
  596. ("html", self.startTagHtml),
  597. ("title", self.startTagTitle),
  598. (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
  599. ("script", self.startTagScript),
  600. (("base", "basefont", "bgsound", "command", "link"),
  601. self.startTagBaseLinkCommand),
  602. ("meta", self.startTagMeta),
  603. ("head", self.startTagHead)
  604. ])
  605. self.startTagHandler.default = self.startTagOther
  606. self. endTagHandler = utils.MethodDispatcher([
  607. ("head", self.endTagHead),
  608. (("br", "html", "body"), self.endTagHtmlBodyBr)
  609. ])
  610. self.endTagHandler.default = self.endTagOther
  611. # the real thing
  612. def processEOF(self):
  613. self.anythingElse()
  614. return True
  615. def processCharacters(self, token):
  616. self.anythingElse()
  617. return token
  618. def startTagHtml(self, token):
  619. return self.parser.phases["inBody"].processStartTag(token)
  620. def startTagHead(self, token):
  621. self.parser.parseError("two-heads-are-not-better-than-one")
  622. def startTagBaseLinkCommand(self, token):
  623. self.tree.insertElement(token)
  624. self.tree.openElements.pop()
  625. token["selfClosingAcknowledged"] = True
  626. def startTagMeta(self, token):
  627. self.tree.insertElement(token)
  628. self.tree.openElements.pop()
  629. token["selfClosingAcknowledged"] = True
  630. attributes = token["data"]
  631. if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
  632. if "charset" in attributes:
  633. self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
  634. elif ("content" in attributes and
  635. "http-equiv" in attributes and
  636. attributes["http-equiv"].lower() == "content-type"):
  637. # Encoding it as UTF-8 here is a hack, as really we should pass
  638. # the abstract Unicode string, and just use the
  639. # ContentAttrParser on that, but using UTF-8 allows all chars
  640. # to be encoded and as a ASCII-superset works.
  641. data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
  642. parser = inputstream.ContentAttrParser(data)
  643. codec = parser.parse()
  644. self.parser.tokenizer.stream.changeEncoding(codec)
  645. def startTagTitle(self, token):
  646. self.parser.parseRCDataRawtext(token, "RCDATA")
  647. def startTagNoScriptNoFramesStyle(self, token):
  648. # Need to decide whether to implement the scripting-disabled case
  649. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  650. def startTagScript(self, token):
  651. self.tree.insertElement(token)
  652. self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
  653. self.parser.originalPhase = self.parser.phase
  654. self.parser.phase = self.parser.phases["text"]
  655. def startTagOther(self, token):
  656. self.anythingElse()
  657. return token
  658. def endTagHead(self, token):
  659. node = self.parser.tree.openElements.pop()
  660. assert node.name == "head", "Expected head got %s" % node.name
  661. self.parser.phase = self.parser.phases["afterHead"]
  662. def endTagHtmlBodyBr(self, token):
  663. self.anythingElse()
  664. return token
  665. def endTagOther(self, token):
  666. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  667. def anythingElse(self):
  668. self.endTagHead(impliedTagToken("head"))
  669. # XXX If we implement a parser for which scripting is disabled we need to
  670. # implement this phase.
  671. #
  672. # class InHeadNoScriptPhase(Phase):
  673. class AfterHeadPhase(Phase):
  674. def __init__(self, parser, tree):
  675. Phase.__init__(self, parser, tree)
  676. self.startTagHandler = utils.MethodDispatcher([
  677. ("html", self.startTagHtml),
  678. ("body", self.startTagBody),
  679. ("frameset", self.startTagFrameset),
  680. (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
  681. "style", "title"),
  682. self.startTagFromHead),
  683. ("head", self.startTagHead)
  684. ])
  685. self.startTagHandler.default = self.startTagOther
  686. self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
  687. self.endTagHtmlBodyBr)])
  688. self.endTagHandler.default = self.endTagOther
  689. def processEOF(self):
  690. self.anythingElse()
  691. return True
  692. def processCharacters(self, token):
  693. self.anythingElse()
  694. return token
  695. def startTagHtml(self, token):
  696. return self.parser.phases["inBody"].processStartTag(token)
  697. def startTagBody(self, token):
  698. self.parser.framesetOK = False
  699. self.tree.insertElement(token)
  700. self.parser.phase = self.parser.phases["inBody"]
  701. def startTagFrameset(self, token):
  702. self.tree.insertElement(token)
  703. self.parser.phase = self.parser.phases["inFrameset"]
  704. def startTagFromHead(self, token):
  705. self.parser.parseError("unexpected-start-tag-out-of-my-head",
  706. {"name": token["name"]})
  707. self.tree.openElements.append(self.tree.headPointer)
  708. self.parser.phases["inHead"].processStartTag(token)
  709. for node in self.tree.openElements[::-1]:
  710. if node.name == "head":
  711. self.tree.openElements.remove(node)
  712. break
  713. def startTagHead(self, token):
  714. self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
  715. def startTagOther(self, token):
  716. self.anythingElse()
  717. return token
  718. def endTagHtmlBodyBr(self, token):
  719. self.anythingElse()
  720. return token
  721. def endTagOther(self, token):
  722. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  723. def anythingElse(self):
  724. self.tree.insertElement(impliedTagToken("body", "StartTag"))
  725. self.parser.phase = self.parser.phases["inBody"]
  726. self.parser.framesetOK = True
  727. class InBodyPhase(Phase):
  728. # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
  729. # the really-really-really-very crazy mode
  730. def __init__(self, parser, tree):
  731. Phase.__init__(self, parser, tree)
  732. # Keep a ref to this for special handling of whitespace in <pre>
  733. self.processSpaceCharactersNonPre = self.processSpaceCharacters
  734. self.startTagHandler = utils.MethodDispatcher([
  735. ("html", self.startTagHtml),
  736. (("base", "basefont", "bgsound", "command", "link", "meta",
  737. "script", "style", "title"),
  738. self.startTagProcessInHead),
  739. ("body", self.startTagBody),
  740. ("frameset", self.startTagFrameset),
  741. (("address", "article", "aside", "blockquote", "center", "details",
  742. "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
  743. "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
  744. "section", "summary", "ul"),
  745. self.startTagCloseP),
  746. (headingElements, self.startTagHeading),
  747. (("pre", "listing"), self.startTagPreListing),
  748. ("form", self.startTagForm),
  749. (("li", "dd", "dt"), self.startTagListItem),
  750. ("plaintext", self.startTagPlaintext),
  751. ("a", self.startTagA),
  752. (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
  753. "strong", "tt", "u"), self.startTagFormatting),
  754. ("nobr", self.startTagNobr),
  755. ("button", self.startTagButton),
  756. (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
  757. ("xmp", self.startTagXmp),
  758. ("table", self.startTagTable),
  759. (("area", "br", "embed", "img", "keygen", "wbr"),
  760. self.startTagVoidFormatting),
  761. (("param", "source", "track"), self.startTagParamSource),
  762. ("input", self.startTagInput),
  763. ("hr", self.startTagHr),
  764. ("image", self.startTagImage),
  765. ("isindex", self.startTagIsIndex),
  766. ("textarea", self.startTagTextarea),
  767. ("iframe", self.startTagIFrame),
  768. (("noembed", "noframes", "noscript"), self.startTagRawtext),
  769. ("select", self.startTagSelect),
  770. (("rp", "rt"), self.startTagRpRt),
  771. (("option", "optgroup"), self.startTagOpt),
  772. (("math"), self.startTagMath),
  773. (("svg"), self.startTagSvg),
  774. (("caption", "col", "colgroup", "frame", "head",
  775. "tbody", "td", "tfoot", "th", "thead",
  776. "tr"), self.startTagMisplaced)
  777. ])
  778. self.startTagHandler.default = self.startTagOther
  779. self.endTagHandler = utils.MethodDispatcher([
  780. ("body", self.endTagBody),
  781. ("html", self.endTagHtml),
  782. (("address", "article", "aside", "blockquote", "button", "center",
  783. "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
  784. "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
  785. "section", "summary", "ul"), self.endTagBlock),
  786. ("form", self.endTagForm),
  787. ("p", self.endTagP),
  788. (("dd", "dt", "li"), self.endTagListItem),
  789. (headingElements, self.endTagHeading),
  790. (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
  791. "strike", "strong", "tt", "u"), self.endTagFormatting),
  792. (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
  793. ("br", self.endTagBr),
  794. ])
  795. self.endTagHandler.default = self.endTagOther
  796. def isMatchingFormattingElement(self, node1, node2):
  797. if node1.name != node2.name or node1.namespace != node2.namespace:
  798. return False
  799. elif len(node1.attributes) != len(node2.attributes):
  800. return False
  801. else:
  802. attributes1 = sorted(node1.attributes.items())
  803. attributes2 = sorted(node2.attributes.items())
  804. for attr1, attr2 in zip(attributes1, attributes2):
  805. if attr1 != attr2:
  806. return False
  807. return True
  808. # helper
  809. def addFormattingElement(self, token):
  810. self.tree.insertElement(token)
  811. element = self.tree.openElements[-1]
  812. matchingElements = []
  813. for node in self.tree.activeFormattingElements[::-1]:
  814. if node is Marker:
  815. break
  816. elif self.isMatchingFormattingElement(node, element):
  817. matchingElements.append(node)
  818. assert len(matchingElements) <= 3
  819. if len(matchingElements) == 3:
  820. self.tree.activeFormattingElements.remove(matchingElements[-1])
  821. self.tree.activeFormattingElements.append(element)
  822. # the real deal
  823. def processEOF(self):
  824. allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
  825. "tfoot", "th", "thead", "tr", "body",
  826. "html"))
  827. for node in self.tree.openElements[::-1]:
  828. if node.name not in allowed_elements:
  829. self.parser.parseError("expected-closing-tag-but-got-eof")
  830. break
  831. # Stop parsing
  832. def processSpaceCharactersDropNewline(self, token):
  833. # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
  834. # want to drop leading newlines
  835. data = token["data"]
  836. self.processSpaceCharacters = self.processSpaceCharactersNonPre
  837. if (data.startswith("\n") and
  838. self.tree.openElements[-1].name in ("pre", "listing", "textarea")
  839. and not self.tree.openElements[-1].hasContent()):
  840. data = data[1:]
  841. if data:
  842. self.tree.reconstructActiveFormattingElements()
  843. self.tree.insertText(data)
  844. def processCharacters(self, token):
  845. if token["data"] == "\u0000":
  846. # The tokenizer should always emit null on its own
  847. return
  848. self.tree.reconstructActiveFormattingElements()
  849. self.tree.insertText(token["data"])
  850. # This must be bad for performance
  851. if (self.parser.framesetOK and
  852. any([char not in spaceCharacters
  853. for char in token["data"]])):
  854. self.parser.framesetOK = False
  855. def processSpaceCharacters(self, token):
  856. self.tree.reconstructActiveFormattingElements()
  857. self.tree.insertText(token["data"])
  858. def startTagProcessInHead(self, token):
  859. return self.parser.phases["inHead"].processStartTag(token)
  860. def startTagBody(self, token):
  861. self.parser.parseError("unexpected-start-tag", {"name": "body"})
  862. if (len(self.tree.openElements) == 1
  863. or self.tree.openElements[1].name != "body"):
  864. assert self.parser.innerHTML
  865. else:
  866. self.parser.framesetOK = False
  867. for attr, value in token["data"].items():
  868. if attr not in self.tree.openElements[1].attributes:
  869. self.tree.openElements[1].attributes[attr] = value
  870. def startTagFrameset(self, token):
  871. self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
  872. if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
  873. assert self.parser.innerHTML
  874. elif not self.parser.framesetOK:
  875. pass
  876. else:
  877. if self.tree.openElements[1].parent:
  878. self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
  879. while self.tree.openElements[-1].name != "html":
  880. self.tree.openElements.pop()
  881. self.tree.insertElement(token)
  882. self.parser.phase = self.parser.phases["inFrameset"]
  883. def startTagCloseP(self, token):
  884. if self.tree.elementInScope("p", variant="button"):
  885. self.endTagP(impliedTagToken("p"))
  886. self.tree.insertElement(token)
  887. def startTagPreListing(self, token):
  888. if self.tree.elementInScope("p", variant="button"):
  889. self.endTagP(impliedTagToken("p"))
  890. self.tree.insertElement(token)
  891. self.parser.framesetOK = False
  892. self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  893. def startTagForm(self, token):
  894. if self.tree.formPointer:
  895. self.parser.parseError("unexpected-start-tag", {"name": "form"})
  896. else:
  897. if self.tree.elementInScope("p", variant="button"):
  898. self.endTagP(impliedTagToken("p"))
  899. self.tree.insertElement(token)
  900. self.tree.formPointer = self.tree.openElements[-1]
  901. def startTagListItem(self, token):
  902. self.parser.framesetOK = False
  903. stopNamesMap = {"li": ["li"],
  904. "dt": ["dt", "dd"],
  905. "dd": ["dt", "dd"]}
  906. stopNames = stopNamesMap[token["name"]]
  907. for node in reversed(self.tree.openElements):
  908. if node.name in stopNames:
  909. self.parser.phase.processEndTag(
  910. impliedTagToken(node.name, "EndTag"))
  911. break
  912. if (node.nameTuple in specialElements and
  913. node.name not in ("address", "div", "p")):
  914. break
  915. if self.tree.elementInScope("p", variant="button"):
  916. self.parser.phase.processEndTag(
  917. impliedTagToken("p", "EndTag"))
  918. self.tree.insertElement(token)
  919. def startTagPlaintext(self, token):
  920. if self.tree.elementInScope("p", variant="button"):
  921. self.endTagP(impliedTagToken("p"))
  922. self.tree.insertElement(token)
  923. self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
  924. def startTagHeading(self, token):
  925. if self.tree.elementInScope("p", variant="button"):
  926. self.endTagP(impliedTagToken("p"))
  927. if self.tree.openElements[-1].name in headingElements:
  928. self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
  929. self.tree.openElements.pop()
  930. self.tree.insertElement(token)
  931. def startTagA(self, token):
  932. afeAElement = self.tree.elementInActiveFormattingElements("a")
  933. if afeAElement:
  934. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  935. {"startName": "a", "endName": "a"})
  936. self.endTagFormatting(impliedTagToken("a"))
  937. if afeAElement in self.tree.openElements:
  938. self.tree.openElements.remove(afeAElement)
  939. if afeAElement in self.tree.activeFormattingElements:
  940. self.tree.activeFormattingElements.remove(afeAElement)
  941. self.tree.reconstructActiveFormattingElements()
  942. self.addFormattingElement(token)
  943. def startTagFormatting(self, token):
  944. self.tree.reconstructActiveFormattingElements()
  945. self.addFormattingElement(token)
  946. def startTagNobr(self, token):
  947. self.tree.reconstructActiveFormattingElements()
  948. if self.tree.elementInScope("nobr"):
  949. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  950. {"startName": "nobr", "endName": "nobr"})
  951. self.processEndTag(impliedTagToken("nobr"))
  952. # XXX Need tests that trigger the following
  953. self.tree.reconstructActiveFormattingElements()
  954. self.addFormattingElement(token)
  955. def startTagButton(self, token):
  956. if self.tree.elementInScope("button"):
  957. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  958. {"startName": "button", "endName": "button"})
  959. self.processEndTag(impliedTagToken("button"))
  960. return token
  961. else:
  962. self.tree.reconstructActiveFormattingElements()
  963. self.tree.insertElement(token)
  964. self.parser.framesetOK = False
  965. def startTagAppletMarqueeObject(self, token):
  966. self.tree.reconstructActiveFormattingElements()
  967. self.tree.insertElement(token)
  968. self.tree.activeFormattingElements.append(Marker)
  969. self.parser.framesetOK = False
  970. def startTagXmp(self, token):
  971. if self.tree.elementInScope("p", variant="button"):
  972. self.endTagP(impliedTagToken("p"))
  973. self.tree.reconstructActiveFormattingElements()
  974. self.parser.framesetOK = False
  975. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  976. def startTagTable(self, token):
  977. if self.parser.compatMode != "quirks":
  978. if self.tree.elementInScope("p", variant="button"):
  979. self.processEndTag(impliedTagToken("p"))
  980. self.tree.insertElement(token)
  981. self.parser.framesetOK = False
  982. self.parser.phase = self.parser.phases["inTable"]
  983. def startTagVoidFormatting(self, token):
  984. self.tree.reconstructActiveFormattingElements()
  985. self.tree.insertElement(token)
  986. self.tree.openElements.pop()
  987. token["selfClosingAcknowledged"] = True
  988. self.parser.framesetOK = False
  989. def startTagInput(self, token):
  990. framesetOK = self.parser.framesetOK
  991. self.startTagVoidFormatting(token)
  992. if ("type" in token["data"] and
  993. token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
  994. # input type=hidden doesn't change framesetOK
  995. self.parser.framesetOK = framesetOK
  996. def startTagParamSource(self, token):
  997. self.tree.insertElement(token)
  998. self.tree.openElements.pop()
  999. token["selfClosingAcknowledged"] = True
  1000. def startTagHr(self, token):
  1001. if self.tree.elementInScope("p", variant="button"):
  1002. self.endTagP(impliedTagToken("p"))
  1003. self.tree.insertElement(token)
  1004. self.tree.openElements.pop()
  1005. token["selfClosingAcknowledged"] = True
  1006. self.parser.framesetOK = False
  1007. def startTagImage(self, token):
  1008. # No really...
  1009. self.parser.parseError("unexpected-start-tag-treated-as",
  1010. {"originalName": "image", "newName": "img"})
  1011. self.processStartTag(impliedTagToken("img", "StartTag",
  1012. attributes=token["data"],
  1013. selfClosing=token["selfClosing"]))
  1014. def startTagIsIndex(self, token):
  1015. self.parser.parseError("deprecated-tag", {"name": "isindex"})
  1016. if self.tree.formPointer:
  1017. return
  1018. form_attrs = {}
  1019. if "action" in token["data"]:
  1020. form_attrs["action"] = token["data"]["action"]
  1021. self.processStartTag(impliedTagToken("form", "StartTag",
  1022. attributes=form_attrs))
  1023. self.processStartTag(impliedTagToken("hr", "StartTag"))
  1024. self.processStartTag(impliedTagToken("label", "StartTag"))
  1025. # XXX Localization ...
  1026. if "prompt" in token["data"]:
  1027. prompt = token["data"]["prompt"]
  1028. else:
  1029. prompt = "This is a searchable index. Enter search keywords: "
  1030. self.processCharacters(
  1031. {"type": tokenTypes["Characters"], "data": prompt})
  1032. attributes = token["data"].copy()
  1033. if "action" in attributes:
  1034. del attributes["action"]
  1035. if "prompt" in attributes:
  1036. del attributes["prompt"]
  1037. attributes["name"] = "isindex"
  1038. self.processStartTag(impliedTagToken("input", "StartTag",
  1039. attributes=attributes,
  1040. selfClosing=token["selfClosing"]))
  1041. self.processEndTag(impliedTagToken("label"))
  1042. self.processStartTag(impliedTagToken("hr", "StartTag"))
  1043. self.processEndTag(impliedTagToken("form"))
  1044. def startTagTextarea(self, token):
  1045. self.tree.insertElement(token)
  1046. self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
  1047. self.processSpaceCharacters = self.processSpaceCharactersDropNewline
  1048. self.parser.framesetOK = False
  1049. def startTagIFrame(self, token):
  1050. self.parser.framesetOK = False
  1051. self.startTagRawtext(token)
  1052. def startTagRawtext(self, token):
  1053. """iframe, noembed noframes, noscript(if scripting enabled)"""
  1054. self.parser.parseRCDataRawtext(token, "RAWTEXT")
  1055. def startTagOpt(self, token):
  1056. if self.tree.openElements[-1].name == "option":
  1057. self.parser.phase.processEndTag(impliedTagToken("option"))
  1058. self.tree.reconstructActiveFormattingElements()
  1059. self.parser.tree.insertElement(token)
  1060. def startTagSelect(self, token):
  1061. self.tree.reconstructActiveFormattingElements()
  1062. self.tree.insertElement(token)
  1063. self.parser.framesetOK = False
  1064. if self.parser.phase in (self.parser.phases["inTable"],
  1065. self.parser.phases["inCaption"],
  1066. self.parser.phases["inColumnGroup"],
  1067. self.parser.phases["inTableBody"],
  1068. self.parser.phases["inRow"],
  1069. self.parser.phases["inCell"]):
  1070. self.parser.phase = self.parser.phases["inSelectInTable"]
  1071. else:
  1072. self.parser.phase = self.parser.phases["inSelect"]
  1073. def startTagRpRt(self, token):
  1074. if self.tree.elementInScope("ruby"):
  1075. self.tree.generateImpliedEndTags()
  1076. if self.tree.openElements[-1].name != "ruby":
  1077. self.parser.parseError()
  1078. self.tree.insertElement(token)
  1079. def startTagMath(self, token):
  1080. self.tree.reconstructActiveFormattingElements()
  1081. self.parser.adjustMathMLAttributes(token)
  1082. self.parser.adjustForeignAttributes(token)
  1083. token["namespace"] = namespaces["mathml"]
  1084. self.tree.insertElement(token)
  1085. # Need to get the parse error right for the case where the token
  1086. # has a namespace not equal to the xmlns attribute
  1087. if token["selfClosing"]:
  1088. self.tree.openElements.pop()
  1089. token["selfClosingAcknowledged"] = True
  1090. def startTagSvg(self, token):
  1091. self.tree.reconstructActiveFormattingElements()
  1092. self.parser.adjustSVGAttributes(token)
  1093. self.parser.adjustForeignAttributes(token)
  1094. token["namespace"] = namespaces["svg"]
  1095. self.tree.insertElement(token)
  1096. # Need to get the parse error right for the case where the token
  1097. # has a namespace not equal to the xmlns attribute
  1098. if token["selfClosing"]:
  1099. self.tree.openElements.pop()
  1100. token["selfClosingAcknowledged"] = True
  1101. def startTagMisplaced(self, token):
  1102. """ Elements that should be children of other elements that have a
  1103. different insertion mode; here they are ignored
  1104. "caption", "col", "colgroup", "frame", "frameset", "head",
  1105. "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
  1106. "tr", "noscript"
  1107. """
  1108. self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
  1109. def startTagOther(self, token):
  1110. self.tree.reconstructActiveFormattingElements()
  1111. self.tree.insertElement(token)
  1112. def endTagP(self, token):
  1113. if not self.tree.elementInScope("p", variant="button"):
  1114. self.startTagCloseP(impliedTagToken("p", "StartTag"))
  1115. self.parser.parseError("unexpected-end-tag", {"name": "p"})
  1116. self.endTagP(impliedTagToken("p", "EndTag"))
  1117. else:
  1118. self.tree.generateImpliedEndTags("p")
  1119. if self.tree.openElements[-1].name != "p":
  1120. self.parser.parseError("unexpected-end-tag", {"name": "p"})
  1121. node = self.tree.openElements.pop()
  1122. while node.name != "p":
  1123. node = self.tree.openElements.pop()
  1124. def endTagBody(self, token):
  1125. if not self.tree.elementInScope("body"):
  1126. self.parser.parseError()
  1127. return
  1128. elif self.tree.openElements[-1].name != "body":
  1129. for node in self.tree.openElements[2:]:
  1130. if node.name not in frozenset(("dd", "dt", "li", "optgroup",
  1131. "option", "p", "rp", "rt",
  1132. "tbody", "td", "tfoot",
  1133. "th", "thead", "tr", "body",
  1134. "html")):
  1135. # Not sure this is the correct name for the parse error
  1136. self.parser.parseError(
  1137. "expected-one-end-tag-but-got-another",
  1138. {"expectedName": "body", "gotName": node.name})
  1139. break
  1140. self.parser.phase = self.parser.phases["afterBody"]
  1141. def endTagHtml(self, token):
  1142. # We repeat the test for the body end tag token being ignored here
  1143. if self.tree.elementInScope("body"):
  1144. self.endTagBody(impliedTagToken("body"))
  1145. return token
  1146. def endTagBlock(self, token):
  1147. # Put us back in the right whitespace handling mode
  1148. if token["name"] == "pre":
  1149. self.processSpaceCharacters = self.processSpaceCharactersNonPre
  1150. inScope = self.tree.elementInScope(token["name"])
  1151. if inScope:
  1152. self.tree.generateImpliedEndTags()
  1153. if self.tree.openElements[-1].name != token["name"]:
  1154. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1155. if inScope:
  1156. node = self.tree.openElements.pop()
  1157. while node.name != token["name"]:
  1158. node = self.tree.openElements.pop()
  1159. def endTagForm(self, token):
  1160. node = self.tree.formPointer
  1161. self.tree.formPointer = None
  1162. if node is None or not self.tree.elementInScope(node):
  1163. self.parser.parseError("unexpected-end-tag",
  1164. {"name": "form"})
  1165. else:
  1166. self.tree.generateImpliedEndTags()
  1167. if self.tree.openElements[-1] != node:
  1168. self.parser.parseError("end-tag-too-early-ignored",
  1169. {"name": "form"})
  1170. self.tree.openElements.remove(node)
  1171. def endTagListItem(self, token):
  1172. if token["name"] == "li":
  1173. variant = "list"
  1174. else:
  1175. variant = None
  1176. if not self.tree.elementInScope(token["name"], variant=variant):
  1177. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1178. else:
  1179. self.tree.generateImpliedEndTags(exclude=token["name"])
  1180. if self.tree.openElements[-1].name != token["name"]:
  1181. self.parser.parseError(
  1182. "end-tag-too-early",
  1183. {"name": token["name"]})
  1184. node = self.tree.openElements.pop()
  1185. while node.name != token["name"]:
  1186. node = self.tree.openElements.pop()
  1187. def endTagHeading(self, token):
  1188. for item in headingElements:
  1189. if self.tree.elementInScope(item):
  1190. self.tree.generateImpliedEndTags()
  1191. break
  1192. if self.tree.openElements[-1].name != token["name"]:
  1193. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1194. for item in headingElements:
  1195. if self.tree.elementInScope(item):
  1196. item = self.tree.openElements.pop()
  1197. while item.name not in headingElements:
  1198. item = self.tree.openElements.pop()
  1199. break
def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element.  ``token`` is the end-tag
    token; only ``token["name"]`` is read.  The method works purely through
    side effects on the open-element stack, the active-formatting list and
    the tree, and always returns None.

    The outer loop runs at most 8 times and the inner loop at most 3
    times.  The step numbers in the comments follow the numbering of the
    specification revision referenced below.
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
            (formattingElement in self.tree.openElements and
             not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first branch above already returns for an
        # element that is open but out of scope, so this branch looks
        # unreachable -- confirm before relying on "adoption-agency-4.4".
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        # The slice starts at the formatting element itself and the first
        # special element found wins.
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The common ancestor is the stack entry directly before the
        # formatting element.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            # Nodes that are not active formatting elements are dropped
            # from the stack and skipped.
            if node not in self.tree.activeFormattingElements:
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        furthestBlock.reparentChildren(clone)

        # Step 13
        furthestBlock.appendChild(clone)

        # Step 14
        # The clone takes the formatting element's place in the
        # active-formatting list at the bookmarked position.
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        # The clone goes onto the stack directly after the furthest block.
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)
  1339. def endTagAppletMarqueeObject(self, token):
  1340. if self.tree.elementInScope(token["name"]):
  1341. self.tree.generateImpliedEndTags()
  1342. if self.tree.openElements[-1].name != token["name"]:
  1343. self.parser.parseError("end-tag-too-early", {"name": token["name"]})
  1344. if self.tree.elementInScope(token["name"]):
  1345. element = self.tree.openElements.pop()
  1346. while element.name != token["name"]:
  1347. element = self.tree.openElements.pop()
  1348. self.tree.clearActiveFormattingElements()
  1349. def endTagBr(self, token):
  1350. self.parser.parseError("unexpected-end-tag-treated-as",
  1351. {"originalName": "br", "newName": "br element"})
  1352. self.tree.reconstructActiveFormattingElements()
  1353. self.tree.insertElement(impliedTagToken("br", "StartTag"))
  1354. self.tree.openElements.pop()
  1355. def endTagOther(self, token):
  1356. for node in self.tree.openElements[::-1]:
  1357. if node.name == token["name"]:
  1358. self.tree.generateImpliedEndTags(exclude=token["name"])
  1359. if self.tree.openElements[-1].name != token["name"]:
  1360. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1361. while self.tree.openElements.pop() != node:
  1362. pass
  1363. break
  1364. else:
  1365. if node.nameTuple in specialElements:
  1366. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1367. break
  1368. class TextPhase(Phase):
  1369. def __init__(self, parser, tree):
  1370. Phase.__init__(self, parser, tree)
  1371. self.startTagHandler = utils.MethodDispatcher([])
  1372. self.startTagHandler.default = self.startTagOther
  1373. self.endTagHandler = utils.MethodDispatcher([
  1374. ("script", self.endTagScript)])
  1375. self.endTagHandler.default = self.endTagOther
  1376. def processCharacters(self, token):
  1377. self.tree.insertText(token["data"])
  1378. def processEOF(self):
  1379. self.parser.parseError("expected-named-closing-tag-but-got-eof",
  1380. {"name": self.tree.openElements[-1].name})
  1381. self.tree.openElements.pop()
  1382. self.parser.phase = self.parser.originalPhase
  1383. return True
  1384. def startTagOther(self, token):
  1385. assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']
  1386. def endTagScript(self, token):
  1387. node = self.tree.openElements.pop()
  1388. assert node.name == "script"
  1389. self.parser.phase = self.parser.originalPhase
  1390. # The rest of this method is all stuff that only happens if
  1391. # document.write works
  1392. def endTagOther(self, token):
  1393. self.tree.openElements.pop()
  1394. self.parser.phase = self.parser.originalPhase
  1395. class InTablePhase(Phase):
  1396. # http://www.whatwg.org/specs/web-apps/current-work/#in-table
  1397. def __init__(self, parser, tree):
  1398. Phase.__init__(self, parser, tree)
  1399. self.startTagHandler = utils.MethodDispatcher([
  1400. ("html", self.startTagHtml),
  1401. ("caption", self.startTagCaption),
  1402. ("colgroup", self.startTagColgroup),
  1403. ("col", self.startTagCol),
  1404. (("tbody", "tfoot", "thead"), self.startTagRowGroup),
  1405. (("td", "th", "tr"), self.startTagImplyTbody),
  1406. ("table", self.startTagTable),
  1407. (("style", "script"), self.startTagStyleScript),
  1408. ("input", self.startTagInput),
  1409. ("form", self.startTagForm)
  1410. ])
  1411. self.startTagHandler.default = self.startTagOther
  1412. self.endTagHandler = utils.MethodDispatcher([
  1413. ("table", self.endTagTable),
  1414. (("body", "caption", "col", "colgroup", "html", "tbody", "td",
  1415. "tfoot", "th", "thead", "tr"), self.endTagIgnore)
  1416. ])
  1417. self.endTagHandler.default = self.endTagOther
  1418. # helper methods
  1419. def clearStackToTableContext(self):
  1420. # "clear the stack back to a table context"
  1421. while self.tree.openElements[-1].name not in ("table", "html"):
  1422. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1423. # {"name": self.tree.openElements[-1].name})
  1424. self.tree.openElements.pop()
  1425. # When the current node is <html> it's an innerHTML case
  1426. # processing methods
  1427. def processEOF(self):
  1428. if self.tree.openElements[-1].name != "html":
  1429. self.parser.parseError("eof-in-table")
  1430. else:
  1431. assert self.parser.innerHTML
  1432. # Stop parsing
  1433. def processSpaceCharacters(self, token):
  1434. originalPhase = self.parser.phase
  1435. self.parser.phase = self.parser.phases["inTableText"]
  1436. self.parser.phase.originalPhase = originalPhase
  1437. self.parser.phase.processSpaceCharacters(token)
  1438. def processCharacters(self, token):
  1439. originalPhase = self.parser.phase
  1440. self.parser.phase = self.parser.phases["inTableText"]
  1441. self.parser.phase.originalPhase = originalPhase
  1442. self.parser.phase.processCharacters(token)
  1443. def insertText(self, token):
  1444. # If we get here there must be at least one non-whitespace character
  1445. # Do the table magic!
  1446. self.tree.insertFromTable = True
  1447. self.parser.phases["inBody"].processCharacters(token)
  1448. self.tree.insertFromTable = False
  1449. def startTagCaption(self, token):
  1450. self.clearStackToTableContext()
  1451. self.tree.activeFormattingElements.append(Marker)
  1452. self.tree.insertElement(token)
  1453. self.parser.phase = self.parser.phases["inCaption"]
  1454. def startTagColgroup(self, token):
  1455. self.clearStackToTableContext()
  1456. self.tree.insertElement(token)
  1457. self.parser.phase = self.parser.phases["inColumnGroup"]
  1458. def startTagCol(self, token):
  1459. self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
  1460. return token
  1461. def startTagRowGroup(self, token):
  1462. self.clearStackToTableContext()
  1463. self.tree.insertElement(token)
  1464. self.parser.phase = self.parser.phases["inTableBody"]
  1465. def startTagImplyTbody(self, token):
  1466. self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
  1467. return token
  1468. def startTagTable(self, token):
  1469. self.parser.parseError("unexpected-start-tag-implies-end-tag",
  1470. {"startName": "table", "endName": "table"})
  1471. self.parser.phase.processEndTag(impliedTagToken("table"))
  1472. if not self.parser.innerHTML:
  1473. return token
  1474. def startTagStyleScript(self, token):
  1475. return self.parser.phases["inHead"].processStartTag(token)
  1476. def startTagInput(self, token):
  1477. if ("type" in token["data"] and
  1478. token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
  1479. self.parser.parseError("unexpected-hidden-input-in-table")
  1480. self.tree.insertElement(token)
  1481. # XXX associate with form
  1482. self.tree.openElements.pop()
  1483. else:
  1484. self.startTagOther(token)
  1485. def startTagForm(self, token):
  1486. self.parser.parseError("unexpected-form-in-table")
  1487. if self.tree.formPointer is None:
  1488. self.tree.insertElement(token)
  1489. self.tree.formPointer = self.tree.openElements[-1]
  1490. self.tree.openElements.pop()
  1491. def startTagOther(self, token):
  1492. self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
  1493. # Do the table magic!
  1494. self.tree.insertFromTable = True
  1495. self.parser.phases["inBody"].processStartTag(token)
  1496. self.tree.insertFromTable = False
  1497. def endTagTable(self, token):
  1498. if self.tree.elementInScope("table", variant="table"):
  1499. self.tree.generateImpliedEndTags()
  1500. if self.tree.openElements[-1].name != "table":
  1501. self.parser.parseError("end-tag-too-early-named",
  1502. {"gotName": "table",
  1503. "expectedName": self.tree.openElements[-1].name})
  1504. while self.tree.openElements[-1].name != "table":
  1505. self.tree.openElements.pop()
  1506. self.tree.openElements.pop()
  1507. self.parser.resetInsertionMode()
  1508. else:
  1509. # innerHTML case
  1510. assert self.parser.innerHTML
  1511. self.parser.parseError()
  1512. def endTagIgnore(self, token):
  1513. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1514. def endTagOther(self, token):
  1515. self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
  1516. # Do the table magic!
  1517. self.tree.insertFromTable = True
  1518. self.parser.phases["inBody"].processEndTag(token)
  1519. self.tree.insertFromTable = False
  1520. class InTableTextPhase(Phase):
  1521. def __init__(self, parser, tree):
  1522. Phase.__init__(self, parser, tree)
  1523. self.originalPhase = None
  1524. self.characterTokens = []
  1525. def flushCharacters(self):
  1526. data = "".join([item["data"] for item in self.characterTokens])
  1527. if any([item not in spaceCharacters for item in data]):
  1528. token = {"type": tokenTypes["Characters"], "data": data}
  1529. self.parser.phases["inTable"].insertText(token)
  1530. elif data:
  1531. self.tree.insertText(data)
  1532. self.characterTokens = []
  1533. def processComment(self, token):
  1534. self.flushCharacters()
  1535. self.parser.phase = self.originalPhase
  1536. return token
  1537. def processEOF(self):
  1538. self.flushCharacters()
  1539. self.parser.phase = self.originalPhase
  1540. return True
  1541. def processCharacters(self, token):
  1542. if token["data"] == "\u0000":
  1543. return
  1544. self.characterTokens.append(token)
  1545. def processSpaceCharacters(self, token):
  1546. # pretty sure we should never reach here
  1547. self.characterTokens.append(token)
  1548. # assert False
  1549. def processStartTag(self, token):
  1550. self.flushCharacters()
  1551. self.parser.phase = self.originalPhase
  1552. return token
  1553. def processEndTag(self, token):
  1554. self.flushCharacters()
  1555. self.parser.phase = self.originalPhase
  1556. return token
  1557. class InCaptionPhase(Phase):
  1558. # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
  1559. def __init__(self, parser, tree):
  1560. Phase.__init__(self, parser, tree)
  1561. self.startTagHandler = utils.MethodDispatcher([
  1562. ("html", self.startTagHtml),
  1563. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1564. "thead", "tr"), self.startTagTableElement)
  1565. ])
  1566. self.startTagHandler.default = self.startTagOther
  1567. self.endTagHandler = utils.MethodDispatcher([
  1568. ("caption", self.endTagCaption),
  1569. ("table", self.endTagTable),
  1570. (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
  1571. "thead", "tr"), self.endTagIgnore)
  1572. ])
  1573. self.endTagHandler.default = self.endTagOther
  1574. def ignoreEndTagCaption(self):
  1575. return not self.tree.elementInScope("caption", variant="table")
  1576. def processEOF(self):
  1577. self.parser.phases["inBody"].processEOF()
  1578. def processCharacters(self, token):
  1579. return self.parser.phases["inBody"].processCharacters(token)
  1580. def startTagTableElement(self, token):
  1581. self.parser.parseError()
  1582. # XXX Have to duplicate logic here to find out if the tag is ignored
  1583. ignoreEndTag = self.ignoreEndTagCaption()
  1584. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1585. if not ignoreEndTag:
  1586. return token
  1587. def startTagOther(self, token):
  1588. return self.parser.phases["inBody"].processStartTag(token)
  1589. def endTagCaption(self, token):
  1590. if not self.ignoreEndTagCaption():
  1591. # AT this code is quite similar to endTagTable in "InTable"
  1592. self.tree.generateImpliedEndTags()
  1593. if self.tree.openElements[-1].name != "caption":
  1594. self.parser.parseError("expected-one-end-tag-but-got-another",
  1595. {"gotName": "caption",
  1596. "expectedName": self.tree.openElements[-1].name})
  1597. while self.tree.openElements[-1].name != "caption":
  1598. self.tree.openElements.pop()
  1599. self.tree.openElements.pop()
  1600. self.tree.clearActiveFormattingElements()
  1601. self.parser.phase = self.parser.phases["inTable"]
  1602. else:
  1603. # innerHTML case
  1604. assert self.parser.innerHTML
  1605. self.parser.parseError()
  1606. def endTagTable(self, token):
  1607. self.parser.parseError()
  1608. ignoreEndTag = self.ignoreEndTagCaption()
  1609. self.parser.phase.processEndTag(impliedTagToken("caption"))
  1610. if not ignoreEndTag:
  1611. return token
  1612. def endTagIgnore(self, token):
  1613. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1614. def endTagOther(self, token):
  1615. return self.parser.phases["inBody"].processEndTag(token)
  1616. class InColumnGroupPhase(Phase):
  1617. # http://www.whatwg.org/specs/web-apps/current-work/#in-column
  1618. def __init__(self, parser, tree):
  1619. Phase.__init__(self, parser, tree)
  1620. self.startTagHandler = utils.MethodDispatcher([
  1621. ("html", self.startTagHtml),
  1622. ("col", self.startTagCol)
  1623. ])
  1624. self.startTagHandler.default = self.startTagOther
  1625. self.endTagHandler = utils.MethodDispatcher([
  1626. ("colgroup", self.endTagColgroup),
  1627. ("col", self.endTagCol)
  1628. ])
  1629. self.endTagHandler.default = self.endTagOther
  1630. def ignoreEndTagColgroup(self):
  1631. return self.tree.openElements[-1].name == "html"
  1632. def processEOF(self):
  1633. if self.tree.openElements[-1].name == "html":
  1634. assert self.parser.innerHTML
  1635. return
  1636. else:
  1637. ignoreEndTag = self.ignoreEndTagColgroup()
  1638. self.endTagColgroup(impliedTagToken("colgroup"))
  1639. if not ignoreEndTag:
  1640. return True
  1641. def processCharacters(self, token):
  1642. ignoreEndTag = self.ignoreEndTagColgroup()
  1643. self.endTagColgroup(impliedTagToken("colgroup"))
  1644. if not ignoreEndTag:
  1645. return token
  1646. def startTagCol(self, token):
  1647. self.tree.insertElement(token)
  1648. self.tree.openElements.pop()
  1649. def startTagOther(self, token):
  1650. ignoreEndTag = self.ignoreEndTagColgroup()
  1651. self.endTagColgroup(impliedTagToken("colgroup"))
  1652. if not ignoreEndTag:
  1653. return token
  1654. def endTagColgroup(self, token):
  1655. if self.ignoreEndTagColgroup():
  1656. # innerHTML case
  1657. assert self.parser.innerHTML
  1658. self.parser.parseError()
  1659. else:
  1660. self.tree.openElements.pop()
  1661. self.parser.phase = self.parser.phases["inTable"]
  1662. def endTagCol(self, token):
  1663. self.parser.parseError("no-end-tag", {"name": "col"})
  1664. def endTagOther(self, token):
  1665. ignoreEndTag = self.ignoreEndTagColgroup()
  1666. self.endTagColgroup(impliedTagToken("colgroup"))
  1667. if not ignoreEndTag:
  1668. return token
  1669. class InTableBodyPhase(Phase):
  1670. # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
  1671. def __init__(self, parser, tree):
  1672. Phase.__init__(self, parser, tree)
  1673. self.startTagHandler = utils.MethodDispatcher([
  1674. ("html", self.startTagHtml),
  1675. ("tr", self.startTagTr),
  1676. (("td", "th"), self.startTagTableCell),
  1677. (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
  1678. self.startTagTableOther)
  1679. ])
  1680. self.startTagHandler.default = self.startTagOther
  1681. self.endTagHandler = utils.MethodDispatcher([
  1682. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1683. ("table", self.endTagTable),
  1684. (("body", "caption", "col", "colgroup", "html", "td", "th",
  1685. "tr"), self.endTagIgnore)
  1686. ])
  1687. self.endTagHandler.default = self.endTagOther
  1688. # helper methods
  1689. def clearStackToTableBodyContext(self):
  1690. while self.tree.openElements[-1].name not in ("tbody", "tfoot",
  1691. "thead", "html"):
  1692. # self.parser.parseError("unexpected-implied-end-tag-in-table",
  1693. # {"name": self.tree.openElements[-1].name})
  1694. self.tree.openElements.pop()
  1695. if self.tree.openElements[-1].name == "html":
  1696. assert self.parser.innerHTML
  1697. # the rest
  1698. def processEOF(self):
  1699. self.parser.phases["inTable"].processEOF()
  1700. def processSpaceCharacters(self, token):
  1701. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1702. def processCharacters(self, token):
  1703. return self.parser.phases["inTable"].processCharacters(token)
  1704. def startTagTr(self, token):
  1705. self.clearStackToTableBodyContext()
  1706. self.tree.insertElement(token)
  1707. self.parser.phase = self.parser.phases["inRow"]
  1708. def startTagTableCell(self, token):
  1709. self.parser.parseError("unexpected-cell-in-table-body",
  1710. {"name": token["name"]})
  1711. self.startTagTr(impliedTagToken("tr", "StartTag"))
  1712. return token
  1713. def startTagTableOther(self, token):
  1714. # XXX AT Any ideas on how to share this with endTagTable?
  1715. if (self.tree.elementInScope("tbody", variant="table") or
  1716. self.tree.elementInScope("thead", variant="table") or
  1717. self.tree.elementInScope("tfoot", variant="table")):
  1718. self.clearStackToTableBodyContext()
  1719. self.endTagTableRowGroup(
  1720. impliedTagToken(self.tree.openElements[-1].name))
  1721. return token
  1722. else:
  1723. # innerHTML case
  1724. assert self.parser.innerHTML
  1725. self.parser.parseError()
  1726. def startTagOther(self, token):
  1727. return self.parser.phases["inTable"].processStartTag(token)
  1728. def endTagTableRowGroup(self, token):
  1729. if self.tree.elementInScope(token["name"], variant="table"):
  1730. self.clearStackToTableBodyContext()
  1731. self.tree.openElements.pop()
  1732. self.parser.phase = self.parser.phases["inTable"]
  1733. else:
  1734. self.parser.parseError("unexpected-end-tag-in-table-body",
  1735. {"name": token["name"]})
  1736. def endTagTable(self, token):
  1737. if (self.tree.elementInScope("tbody", variant="table") or
  1738. self.tree.elementInScope("thead", variant="table") or
  1739. self.tree.elementInScope("tfoot", variant="table")):
  1740. self.clearStackToTableBodyContext()
  1741. self.endTagTableRowGroup(
  1742. impliedTagToken(self.tree.openElements[-1].name))
  1743. return token
  1744. else:
  1745. # innerHTML case
  1746. assert self.parser.innerHTML
  1747. self.parser.parseError()
  1748. def endTagIgnore(self, token):
  1749. self.parser.parseError("unexpected-end-tag-in-table-body",
  1750. {"name": token["name"]})
  1751. def endTagOther(self, token):
  1752. return self.parser.phases["inTable"].processEndTag(token)
  1753. class InRowPhase(Phase):
  1754. # http://www.whatwg.org/specs/web-apps/current-work/#in-row
  1755. def __init__(self, parser, tree):
  1756. Phase.__init__(self, parser, tree)
  1757. self.startTagHandler = utils.MethodDispatcher([
  1758. ("html", self.startTagHtml),
  1759. (("td", "th"), self.startTagTableCell),
  1760. (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
  1761. "tr"), self.startTagTableOther)
  1762. ])
  1763. self.startTagHandler.default = self.startTagOther
  1764. self.endTagHandler = utils.MethodDispatcher([
  1765. ("tr", self.endTagTr),
  1766. ("table", self.endTagTable),
  1767. (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
  1768. (("body", "caption", "col", "colgroup", "html", "td", "th"),
  1769. self.endTagIgnore)
  1770. ])
  1771. self.endTagHandler.default = self.endTagOther
  1772. # helper methods (XXX unify this with other table helper methods)
  1773. def clearStackToTableRowContext(self):
  1774. while self.tree.openElements[-1].name not in ("tr", "html"):
  1775. self.parser.parseError("unexpected-implied-end-tag-in-table-row",
  1776. {"name": self.tree.openElements[-1].name})
  1777. self.tree.openElements.pop()
  1778. def ignoreEndTagTr(self):
  1779. return not self.tree.elementInScope("tr", variant="table")
  1780. # the rest
  1781. def processEOF(self):
  1782. self.parser.phases["inTable"].processEOF()
  1783. def processSpaceCharacters(self, token):
  1784. return self.parser.phases["inTable"].processSpaceCharacters(token)
  1785. def processCharacters(self, token):
  1786. return self.parser.phases["inTable"].processCharacters(token)
  1787. def startTagTableCell(self, token):
  1788. self.clearStackToTableRowContext()
  1789. self.tree.insertElement(token)
  1790. self.parser.phase = self.parser.phases["inCell"]
  1791. self.tree.activeFormattingElements.append(Marker)
  1792. def startTagTableOther(self, token):
  1793. ignoreEndTag = self.ignoreEndTagTr()
  1794. self.endTagTr(impliedTagToken("tr"))
  1795. # XXX how are we sure it's always ignored in the innerHTML case?
  1796. if not ignoreEndTag:
  1797. return token
  1798. def startTagOther(self, token):
  1799. return self.parser.phases["inTable"].processStartTag(token)
  1800. def endTagTr(self, token):
  1801. if not self.ignoreEndTagTr():
  1802. self.clearStackToTableRowContext()
  1803. self.tree.openElements.pop()
  1804. self.parser.phase = self.parser.phases["inTableBody"]
  1805. else:
  1806. # innerHTML case
  1807. assert self.parser.innerHTML
  1808. self.parser.parseError()
  1809. def endTagTable(self, token):
  1810. ignoreEndTag = self.ignoreEndTagTr()
  1811. self.endTagTr(impliedTagToken("tr"))
  1812. # Reprocess the current tag if the tr end tag was not ignored
  1813. # XXX how are we sure it's always ignored in the innerHTML case?
  1814. if not ignoreEndTag:
  1815. return token
  1816. def endTagTableRowGroup(self, token):
  1817. if self.tree.elementInScope(token["name"], variant="table"):
  1818. self.endTagTr(impliedTagToken("tr"))
  1819. return token
  1820. else:
  1821. self.parser.parseError()
  1822. def endTagIgnore(self, token):
  1823. self.parser.parseError("unexpected-end-tag-in-table-row",
  1824. {"name": token["name"]})
  1825. def endTagOther(self, token):
  1826. return self.parser.phases["inTable"].processEndTag(token)
  1827. class InCellPhase(Phase):
  1828. # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
  1829. def __init__(self, parser, tree):
  1830. Phase.__init__(self, parser, tree)
  1831. self.startTagHandler = utils.MethodDispatcher([
  1832. ("html", self.startTagHtml),
  1833. (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
  1834. "thead", "tr"), self.startTagTableOther)
  1835. ])
  1836. self.startTagHandler.default = self.startTagOther
  1837. self.endTagHandler = utils.MethodDispatcher([
  1838. (("td", "th"), self.endTagTableCell),
  1839. (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
  1840. (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
  1841. ])
  1842. self.endTagHandler.default = self.endTagOther
  1843. # helper
  1844. def closeCell(self):
  1845. if self.tree.elementInScope("td", variant="table"):
  1846. self.endTagTableCell(impliedTagToken("td"))
  1847. elif self.tree.elementInScope("th", variant="table"):
  1848. self.endTagTableCell(impliedTagToken("th"))
  1849. # the rest
  1850. def processEOF(self):
  1851. self.parser.phases["inBody"].processEOF()
  1852. def processCharacters(self, token):
  1853. return self.parser.phases["inBody"].processCharacters(token)
  1854. def startTagTableOther(self, token):
  1855. if (self.tree.elementInScope("td", variant="table") or
  1856. self.tree.elementInScope("th", variant="table")):
  1857. self.closeCell()
  1858. return token
  1859. else:
  1860. # innerHTML case
  1861. assert self.parser.innerHTML
  1862. self.parser.parseError()
  1863. def startTagOther(self, token):
  1864. return self.parser.phases["inBody"].processStartTag(token)
  1865. def endTagTableCell(self, token):
  1866. if self.tree.elementInScope(token["name"], variant="table"):
  1867. self.tree.generateImpliedEndTags(token["name"])
  1868. if self.tree.openElements[-1].name != token["name"]:
  1869. self.parser.parseError("unexpected-cell-end-tag",
  1870. {"name": token["name"]})
  1871. while True:
  1872. node = self.tree.openElements.pop()
  1873. if node.name == token["name"]:
  1874. break
  1875. else:
  1876. self.tree.openElements.pop()
  1877. self.tree.clearActiveFormattingElements()
  1878. self.parser.phase = self.parser.phases["inRow"]
  1879. else:
  1880. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1881. def endTagIgnore(self, token):
  1882. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  1883. def endTagImply(self, token):
  1884. if self.tree.elementInScope(token["name"], variant="table"):
  1885. self.closeCell()
  1886. return token
  1887. else:
  1888. # sometimes innerHTML case
  1889. self.parser.parseError()
  1890. def endTagOther(self, token):
  1891. return self.parser.phases["inBody"].processEndTag(token)
  1892. class InSelectPhase(Phase):
  1893. def __init__(self, parser, tree):
  1894. Phase.__init__(self, parser, tree)
  1895. self.startTagHandler = utils.MethodDispatcher([
  1896. ("html", self.startTagHtml),
  1897. ("option", self.startTagOption),
  1898. ("optgroup", self.startTagOptgroup),
  1899. ("select", self.startTagSelect),
  1900. (("input", "keygen", "textarea"), self.startTagInput),
  1901. ("script", self.startTagScript)
  1902. ])
  1903. self.startTagHandler.default = self.startTagOther
  1904. self.endTagHandler = utils.MethodDispatcher([
  1905. ("option", self.endTagOption),
  1906. ("optgroup", self.endTagOptgroup),
  1907. ("select", self.endTagSelect)
  1908. ])
  1909. self.endTagHandler.default = self.endTagOther
  1910. # http://www.whatwg.org/specs/web-apps/current-work/#in-select
  1911. def processEOF(self):
  1912. if self.tree.openElements[-1].name != "html":
  1913. self.parser.parseError("eof-in-select")
  1914. else:
  1915. assert self.parser.innerHTML
  1916. def processCharacters(self, token):
  1917. if token["data"] == "\u0000":
  1918. return
  1919. self.tree.insertText(token["data"])
  1920. def startTagOption(self, token):
  1921. # We need to imply </option> if <option> is the current node.
  1922. if self.tree.openElements[-1].name == "option":
  1923. self.tree.openElements.pop()
  1924. self.tree.insertElement(token)
  1925. def startTagOptgroup(self, token):
  1926. if self.tree.openElements[-1].name == "option":
  1927. self.tree.openElements.pop()
  1928. if self.tree.openElements[-1].name == "optgroup":
  1929. self.tree.openElements.pop()
  1930. self.tree.insertElement(token)
  1931. def startTagSelect(self, token):
  1932. self.parser.parseError("unexpected-select-in-select")
  1933. self.endTagSelect(impliedTagToken("select"))
  1934. def startTagInput(self, token):
  1935. self.parser.parseError("unexpected-input-in-select")
  1936. if self.tree.elementInScope("select", variant="select"):
  1937. self.endTagSelect(impliedTagToken("select"))
  1938. return token
  1939. else:
  1940. assert self.parser.innerHTML
  1941. def startTagScript(self, token):
  1942. return self.parser.phases["inHead"].processStartTag(token)
  1943. def startTagOther(self, token):
  1944. self.parser.parseError("unexpected-start-tag-in-select",
  1945. {"name": token["name"]})
  1946. def endTagOption(self, token):
  1947. if self.tree.openElements[-1].name == "option":
  1948. self.tree.openElements.pop()
  1949. else:
  1950. self.parser.parseError("unexpected-end-tag-in-select",
  1951. {"name": "option"})
  1952. def endTagOptgroup(self, token):
  1953. # </optgroup> implicitly closes <option>
  1954. if (self.tree.openElements[-1].name == "option" and
  1955. self.tree.openElements[-2].name == "optgroup"):
  1956. self.tree.openElements.pop()
  1957. # It also closes </optgroup>
  1958. if self.tree.openElements[-1].name == "optgroup":
  1959. self.tree.openElements.pop()
  1960. # But nothing else
  1961. else:
  1962. self.parser.parseError("unexpected-end-tag-in-select",
  1963. {"name": "optgroup"})
  1964. def endTagSelect(self, token):
  1965. if self.tree.elementInScope("select", variant="select"):
  1966. node = self.tree.openElements.pop()
  1967. while node.name != "select":
  1968. node = self.tree.openElements.pop()
  1969. self.parser.resetInsertionMode()
  1970. else:
  1971. # innerHTML case
  1972. assert self.parser.innerHTML
  1973. self.parser.parseError()
  1974. def endTagOther(self, token):
  1975. self.parser.parseError("unexpected-end-tag-in-select",
  1976. {"name": token["name"]})
  1977. class InSelectInTablePhase(Phase):
  1978. def __init__(self, parser, tree):
  1979. Phase.__init__(self, parser, tree)
  1980. self.startTagHandler = utils.MethodDispatcher([
  1981. (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
  1982. self.startTagTable)
  1983. ])
  1984. self.startTagHandler.default = self.startTagOther
  1985. self.endTagHandler = utils.MethodDispatcher([
  1986. (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
  1987. self.endTagTable)
  1988. ])
  1989. self.endTagHandler.default = self.endTagOther
  1990. def processEOF(self):
  1991. self.parser.phases["inSelect"].processEOF()
  1992. def processCharacters(self, token):
  1993. return self.parser.phases["inSelect"].processCharacters(token)
  1994. def startTagTable(self, token):
  1995. self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
  1996. self.endTagOther(impliedTagToken("select"))
  1997. return token
  1998. def startTagOther(self, token):
  1999. return self.parser.phases["inSelect"].processStartTag(token)
  2000. def endTagTable(self, token):
  2001. self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
  2002. if self.tree.elementInScope(token["name"], variant="table"):
  2003. self.endTagOther(impliedTagToken("select"))
  2004. return token
  2005. def endTagOther(self, token):
  2006. return self.parser.phases["inSelect"].processEndTag(token)
  2007. class InForeignContentPhase(Phase):
  2008. breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
  2009. "center", "code", "dd", "div", "dl", "dt",
  2010. "em", "embed", "h1", "h2", "h3",
  2011. "h4", "h5", "h6", "head", "hr", "i", "img",
  2012. "li", "listing", "menu", "meta", "nobr",
  2013. "ol", "p", "pre", "ruby", "s", "small",
  2014. "span", "strong", "strike", "sub", "sup",
  2015. "table", "tt", "u", "ul", "var"])
  2016. def __init__(self, parser, tree):
  2017. Phase.__init__(self, parser, tree)
  2018. def adjustSVGTagNames(self, token):
  2019. replacements = {"altglyph": "altGlyph",
  2020. "altglyphdef": "altGlyphDef",
  2021. "altglyphitem": "altGlyphItem",
  2022. "animatecolor": "animateColor",
  2023. "animatemotion": "animateMotion",
  2024. "animatetransform": "animateTransform",
  2025. "clippath": "clipPath",
  2026. "feblend": "feBlend",
  2027. "fecolormatrix": "feColorMatrix",
  2028. "fecomponenttransfer": "feComponentTransfer",
  2029. "fecomposite": "feComposite",
  2030. "feconvolvematrix": "feConvolveMatrix",
  2031. "fediffuselighting": "feDiffuseLighting",
  2032. "fedisplacementmap": "feDisplacementMap",
  2033. "fedistantlight": "feDistantLight",
  2034. "feflood": "feFlood",
  2035. "fefunca": "feFuncA",
  2036. "fefuncb": "feFuncB",
  2037. "fefuncg": "feFuncG",
  2038. "fefuncr": "feFuncR",
  2039. "fegaussianblur": "feGaussianBlur",
  2040. "feimage": "feImage",
  2041. "femerge": "feMerge",
  2042. "femergenode": "feMergeNode",
  2043. "femorphology": "feMorphology",
  2044. "feoffset": "feOffset",
  2045. "fepointlight": "fePointLight",
  2046. "fespecularlighting": "feSpecularLighting",
  2047. "fespotlight": "feSpotLight",
  2048. "fetile": "feTile",
  2049. "feturbulence": "feTurbulence",
  2050. "foreignobject": "foreignObject",
  2051. "glyphref": "glyphRef",
  2052. "lineargradient": "linearGradient",
  2053. "radialgradient": "radialGradient",
  2054. "textpath": "textPath"}
  2055. if token["name"] in replacements:
  2056. token["name"] = replacements[token["name"]]
  2057. def processCharacters(self, token):
  2058. if token["data"] == "\u0000":
  2059. token["data"] = "\uFFFD"
  2060. elif (self.parser.framesetOK and
  2061. any(char not in spaceCharacters for char in token["data"])):
  2062. self.parser.framesetOK = False
  2063. Phase.processCharacters(self, token)
  2064. def processStartTag(self, token):
  2065. currentNode = self.tree.openElements[-1]
  2066. if (token["name"] in self.breakoutElements or
  2067. (token["name"] == "font" and
  2068. set(token["data"].keys()) & set(["color", "face", "size"]))):
  2069. self.parser.parseError("unexpected-html-element-in-foreign-content",
  2070. {"name": token["name"]})
  2071. while (self.tree.openElements[-1].namespace !=
  2072. self.tree.defaultNamespace and
  2073. not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
  2074. not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
  2075. self.tree.openElements.pop()
  2076. return token
  2077. else:
  2078. if currentNode.namespace == namespaces["mathml"]:
  2079. self.parser.adjustMathMLAttributes(token)
  2080. elif currentNode.namespace == namespaces["svg"]:
  2081. self.adjustSVGTagNames(token)
  2082. self.parser.adjustSVGAttributes(token)
  2083. self.parser.adjustForeignAttributes(token)
  2084. token["namespace"] = currentNode.namespace
  2085. self.tree.insertElement(token)
  2086. if token["selfClosing"]:
  2087. self.tree.openElements.pop()
  2088. token["selfClosingAcknowledged"] = True
  2089. def processEndTag(self, token):
  2090. nodeIndex = len(self.tree.openElements) - 1
  2091. node = self.tree.openElements[-1]
  2092. if node.name != token["name"]:
  2093. self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
  2094. while True:
  2095. if node.name.translate(asciiUpper2Lower) == token["name"]:
  2096. # XXX this isn't in the spec but it seems necessary
  2097. if self.parser.phase == self.parser.phases["inTableText"]:
  2098. self.parser.phase.flushCharacters()
  2099. self.parser.phase = self.parser.phase.originalPhase
  2100. while self.tree.openElements.pop() != node:
  2101. assert self.tree.openElements
  2102. new_token = None
  2103. break
  2104. nodeIndex -= 1
  2105. node = self.tree.openElements[nodeIndex]
  2106. if node.namespace != self.tree.defaultNamespace:
  2107. continue
  2108. else:
  2109. new_token = self.parser.phase.processEndTag(token)
  2110. break
  2111. return new_token
  2112. class AfterBodyPhase(Phase):
  2113. def __init__(self, parser, tree):
  2114. Phase.__init__(self, parser, tree)
  2115. self.startTagHandler = utils.MethodDispatcher([
  2116. ("html", self.startTagHtml)
  2117. ])
  2118. self.startTagHandler.default = self.startTagOther
  2119. self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
  2120. self.endTagHandler.default = self.endTagOther
  2121. def processEOF(self):
  2122. # Stop parsing
  2123. pass
  2124. def processComment(self, token):
  2125. # This is needed because data is to be appended to the <html> element
  2126. # here and not to whatever is currently open.
  2127. self.tree.insertComment(token, self.tree.openElements[0])
  2128. def processCharacters(self, token):
  2129. self.parser.parseError("unexpected-char-after-body")
  2130. self.parser.phase = self.parser.phases["inBody"]
  2131. return token
  2132. def startTagHtml(self, token):
  2133. return self.parser.phases["inBody"].processStartTag(token)
  2134. def startTagOther(self, token):
  2135. self.parser.parseError("unexpected-start-tag-after-body",
  2136. {"name": token["name"]})
  2137. self.parser.phase = self.parser.phases["inBody"]
  2138. return token
  2139. def endTagHtml(self, name):
  2140. if self.parser.innerHTML:
  2141. self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
  2142. else:
  2143. self.parser.phase = self.parser.phases["afterAfterBody"]
  2144. def endTagOther(self, token):
  2145. self.parser.parseError("unexpected-end-tag-after-body",
  2146. {"name": token["name"]})
  2147. self.parser.phase = self.parser.phases["inBody"]
  2148. return token
  2149. class InFramesetPhase(Phase):
  2150. # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
  2151. def __init__(self, parser, tree):
  2152. Phase.__init__(self, parser, tree)
  2153. self.startTagHandler = utils.MethodDispatcher([
  2154. ("html", self.startTagHtml),
  2155. ("frameset", self.startTagFrameset),
  2156. ("frame", self.startTagFrame),
  2157. ("noframes", self.startTagNoframes)
  2158. ])
  2159. self.startTagHandler.default = self.startTagOther
  2160. self.endTagHandler = utils.MethodDispatcher([
  2161. ("frameset", self.endTagFrameset)
  2162. ])
  2163. self.endTagHandler.default = self.endTagOther
  2164. def processEOF(self):
  2165. if self.tree.openElements[-1].name != "html":
  2166. self.parser.parseError("eof-in-frameset")
  2167. else:
  2168. assert self.parser.innerHTML
  2169. def processCharacters(self, token):
  2170. self.parser.parseError("unexpected-char-in-frameset")
  2171. def startTagFrameset(self, token):
  2172. self.tree.insertElement(token)
  2173. def startTagFrame(self, token):
  2174. self.tree.insertElement(token)
  2175. self.tree.openElements.pop()
  2176. def startTagNoframes(self, token):
  2177. return self.parser.phases["inBody"].processStartTag(token)
  2178. def startTagOther(self, token):
  2179. self.parser.parseError("unexpected-start-tag-in-frameset",
  2180. {"name": token["name"]})
  2181. def endTagFrameset(self, token):
  2182. if self.tree.openElements[-1].name == "html":
  2183. # innerHTML case
  2184. self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
  2185. else:
  2186. self.tree.openElements.pop()
  2187. if (not self.parser.innerHTML and
  2188. self.tree.openElements[-1].name != "frameset"):
  2189. # If we're not in innerHTML mode and the the current node is not a
  2190. # "frameset" element (anymore) then switch.
  2191. self.parser.phase = self.parser.phases["afterFrameset"]
  2192. def endTagOther(self, token):
  2193. self.parser.parseError("unexpected-end-tag-in-frameset",
  2194. {"name": token["name"]})
  2195. class AfterFramesetPhase(Phase):
  2196. # http://www.whatwg.org/specs/web-apps/current-work/#after3
  2197. def __init__(self, parser, tree):
  2198. Phase.__init__(self, parser, tree)
  2199. self.startTagHandler = utils.MethodDispatcher([
  2200. ("html", self.startTagHtml),
  2201. ("noframes", self.startTagNoframes)
  2202. ])
  2203. self.startTagHandler.default = self.startTagOther
  2204. self.endTagHandler = utils.MethodDispatcher([
  2205. ("html", self.endTagHtml)
  2206. ])
  2207. self.endTagHandler.default = self.endTagOther
  2208. def processEOF(self):
  2209. # Stop parsing
  2210. pass
  2211. def processCharacters(self, token):
  2212. self.parser.parseError("unexpected-char-after-frameset")
  2213. def startTagNoframes(self, token):
  2214. return self.parser.phases["inHead"].processStartTag(token)
  2215. def startTagOther(self, token):
  2216. self.parser.parseError("unexpected-start-tag-after-frameset",
  2217. {"name": token["name"]})
  2218. def endTagHtml(self, token):
  2219. self.parser.phase = self.parser.phases["afterAfterFrameset"]
  2220. def endTagOther(self, token):
  2221. self.parser.parseError("unexpected-end-tag-after-frameset",
  2222. {"name": token["name"]})
  2223. class AfterAfterBodyPhase(Phase):
  2224. def __init__(self, parser, tree):
  2225. Phase.__init__(self, parser, tree)
  2226. self.startTagHandler = utils.MethodDispatcher([
  2227. ("html", self.startTagHtml)
  2228. ])
  2229. self.startTagHandler.default = self.startTagOther
  2230. def processEOF(self):
  2231. pass
  2232. def processComment(self, token):
  2233. self.tree.insertComment(token, self.tree.document)
  2234. def processSpaceCharacters(self, token):
  2235. return self.parser.phases["inBody"].processSpaceCharacters(token)
  2236. def processCharacters(self, token):
  2237. self.parser.parseError("expected-eof-but-got-char")
  2238. self.parser.phase = self.parser.phases["inBody"]
  2239. return token
  2240. def startTagHtml(self, token):
  2241. return self.parser.phases["inBody"].processStartTag(token)
  2242. def startTagOther(self, token):
  2243. self.parser.parseError("expected-eof-but-got-start-tag",
  2244. {"name": token["name"]})
  2245. self.parser.phase = self.parser.phases["inBody"]
  2246. return token
  2247. def processEndTag(self, token):
  2248. self.parser.parseError("expected-eof-but-got-end-tag",
  2249. {"name": token["name"]})
  2250. self.parser.phase = self.parser.phases["inBody"]
  2251. return token
  2252. class AfterAfterFramesetPhase(Phase):
  2253. def __init__(self, parser, tree):
  2254. Phase.__init__(self, parser, tree)
  2255. self.startTagHandler = utils.MethodDispatcher([
  2256. ("html", self.startTagHtml),
  2257. ("noframes", self.startTagNoFrames)
  2258. ])
  2259. self.startTagHandler.default = self.startTagOther
  2260. def processEOF(self):
  2261. pass
  2262. def processComment(self, token):
  2263. self.tree.insertComment(token, self.tree.document)
  2264. def processSpaceCharacters(self, token):
  2265. return self.parser.phases["inBody"].processSpaceCharacters(token)
  2266. def processCharacters(self, token):
  2267. self.parser.parseError("expected-eof-but-got-char")
  2268. def startTagHtml(self, token):
  2269. return self.parser.phases["inBody"].processStartTag(token)
  2270. def startTagNoFrames(self, token):
  2271. return self.parser.phases["inHead"].processStartTag(token)
  2272. def startTagOther(self, token):
  2273. self.parser.parseError("expected-eof-but-got-start-tag",
  2274. {"name": token["name"]})
  2275. def processEndTag(self, token):
  2276. self.parser.parseError("expected-eof-but-got-end-tag",
  2277. {"name": token["name"]})
# Table of phase name -> phase class. The keys are the names used for
# parser.phases[...] lookups by the phase classes (presumably the parser
# instantiates each class under its key - confirm against the caller).
return {
    "initial": InitialPhase,
    "beforeHtml": BeforeHtmlPhase,
    "beforeHead": BeforeHeadPhase,
    "inHead": InHeadPhase,
    # XXX "inHeadNoscript": InHeadNoScriptPhase,
    "afterHead": AfterHeadPhase,
    "inBody": InBodyPhase,
    "text": TextPhase,
    "inTable": InTablePhase,
    "inTableText": InTableTextPhase,
    "inCaption": InCaptionPhase,
    "inColumnGroup": InColumnGroupPhase,
    "inTableBody": InTableBodyPhase,
    "inRow": InRowPhase,
    "inCell": InCellPhase,
    "inSelect": InSelectPhase,
    "inSelectInTable": InSelectInTablePhase,
    "inForeignContent": InForeignContentPhase,
    "afterBody": AfterBodyPhase,
    "inFrameset": InFramesetPhase,
    "afterFrameset": AfterFramesetPhase,
    "afterAfterBody": AfterAfterBodyPhase,
    "afterAfterFrameset": AfterAfterFramesetPhase,
    # NOTE(review): an old "XXX after after frameset" reminder sat here; it
    # looks obsolete now that "afterAfterFrameset" is registered just above.
}
  2304. def impliedTagToken(name, type="EndTag", attributes=None,
  2305. selfClosing=False):
  2306. if attributes is None:
  2307. attributes = {}
  2308. return {"type": tokenTypes[type], "name": name, "data": attributes,
  2309. "selfClosing": selfClosing}
  2310. class ParseError(Exception):
  2311. """Error in parsed document"""
  2312. pass