You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1731 lines
75 KiB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
  1. from __future__ import absolute_import, division, unicode_literals
# Python 2 compatibility: on narrow Py2 builds chr() only covers 0-255, so
# rebind chr to unichr; on Py3 unichr is gone and the NameError is ignored.
try:
    chr = unichr # flake8: noqa
except NameError:
    pass
  6. from collections import deque
  7. from .constants import spaceCharacters
  8. from .constants import entities
  9. from .constants import asciiLetters, asciiUpper2Lower
  10. from .constants import digits, hexDigits, EOF
  11. from .constants import tokenTypes, tagTokenTypes
  12. from .constants import replacementCharacters
  13. from .inputstream import HTMLInputStream
  14. from .trie import Trie
  15. entitiesTrie = Trie(entities)
  16. class HTMLTokenizer(object):
  17. """ This class takes care of tokenizing HTML.
  18. * self.currentToken
  19. Holds the token that is currently being processed.
  20. * self.state
  21. Holds a reference to the method to be invoked... XXX
  22. * self.stream
  23. Points to HTMLInputStream object.
  24. """
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=True, lowercaseAttrName=True, parser=None):
        """Initialise the tokenizer.

        :arg stream: the input, handed to HTMLInputStream
        :arg encoding: overriding character encoding, passed to the stream
        :arg parseMeta: passed to the stream (meta-charset prescan)
        :arg useChardet: passed to the stream (encoding sniffing fallback)
        :arg lowercaseElementName: lowercase tag names when emitting
        :arg lowercaseAttrName: lowercase attribute names when emitting
        :arg parser: the owning parser, or None when used standalone
        """
        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
        self.parser = parser

        # Perform case conversions?
        self.lowercaseElementName = lowercaseElementName
        self.lowercaseAttrName = lowercaseAttrName

        # Setup the initial tokenizer state.  self.state is always a bound
        # method returning True (continue) or False (EOF reached).
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()
    def __iter__(self):
        """This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token, which pauses processing until the next
        token is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            # Surface any errors recorded by the input stream before the
            # tokens produced by the state step.
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()
    def consumeNumberEntity(self, isHex):
        """Consume a numeric character reference after "&#" (or "&#x").

        Returns either U+FFFD or the character based on the decimal or
        hexadecimal representation. It also discards ";" if present; if not
        present a "numeric-entity-without-semicolon" ParseError is queued.

        :arg isHex: True when the reference was introduced by "&#x"/"&#X"
        """
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            # Surrogate halves and out-of-range code points become U+FFFD.
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            # Controls and non-characters are reported but still emitted as-is.
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                        0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                        0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                        0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                        0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                        0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                        0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                        0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                # Narrow build: assemble the surrogate pair by hand.
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char
    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character reference after "&".

        The result (or the literal "&..." text when nothing matches) is
        either appended to the current attribute value (*fromAttribute*)
        or queued as a Characters/SpaceCharacters token.

        :arg allowedChar: the attribute's quote/allowed character; an "&"
            immediately followed by it is treated as literal text
        :arg fromAttribute: True when called from an attribute-value state
        """
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
                or (allowedChar is not None and allowedChar == charStack[0])):
            # Not a character reference at all: leave the "&" as data.
            self.stream.unget(charStack[0])
        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)
        else:
            # At this point in the process might have named entity. Entities
            # are stored in the global variable "entities".
            #
            # Consume characters and compare to these to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                # In attributes, a semicolon-less entity followed by an
                # alphanumeric or "=" is treated as literal text.
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
  190. def processEntityInAttribute(self, allowedChar):
  191. """This method replaces the need for "entityInAttributeValueState".
  192. """
  193. self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
  194. def emitCurrentToken(self):
  195. """This method is a generic handler for emitting the tags. It also sets
  196. the state to "data" because that's what's needed after a token has been
  197. emitted.
  198. """
  199. token = self.currentToken
  200. # Add token to the queue to be yielded
  201. if (token["type"] in tagTokenTypes):
  202. if self.lowercaseElementName:
  203. token["name"] = token["name"].translate(asciiUpper2Lower)
  204. if token["type"] == tokenTypes["EndTag"]:
  205. if token["data"]:
  206. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  207. "data": "attributes-in-end-tag"})
  208. if token["selfClosing"]:
  209. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  210. "data": "self-closing-flag-on-end-tag"})
  211. self.tokenQueue.append(token)
  212. self.state = self.dataState
  213. # Below are the various tokenizer states worked out.
    def dataState(self):
        """Data state: default state for ordinary document text."""
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            # NULL is a parse error but is passed through unchanged here
            # (unlike RCDATA/RAWTEXT/script states, which substitute U+FFFD).
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
  242. def entityDataState(self):
  243. self.consumeEntity()
  244. self.state = self.dataState
  245. return True
    def rcdataState(self):
        """RCDATA state: text content where character references work but
        markup (other than the matching end tag) does not."""
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            # NULL becomes U+FFFD in RCDATA.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
  274. def characterReferenceInRcdata(self):
  275. self.consumeEntity()
  276. self.state = self.rcdataState
  277. return True
    def rawtextState(self):
        """RAWTEXT state: only "<" and NULL are special; no character
        references are processed."""
        data = self.stream.char()
        if data == "<":
            self.state = self.rawtextLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
    def scriptDataState(self):
        """Script data state: like RAWTEXT but with the extra "<!--"
        escaping machinery reachable via the less-than-sign state."""
        data = self.stream.char()
        if data == "<":
            self.state = self.scriptDataLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
    def plaintextState(self):
        """PLAINTEXT state: everything up to EOF is character data."""
        data = self.stream.char()
        if data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + self.stream.charsUntil("\u0000")})
        return True
    def tagOpenState(self):
        """Tag open state: dispatch on the character following "<"."""
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            # Begin a new start tag token.
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True
    def closeTagOpenState(self):
        """End tag open state: dispatch on the character following "</"."""
        data = self.stream.char()
        if data in asciiLetters:
            # Begin a new end tag token.
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True
    def tagNameState(self):
        """Tag name state: accumulate the current tag's name."""
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True
  404. def rcdataLessThanSignState(self):
  405. data = self.stream.char()
  406. if data == "/":
  407. self.temporaryBuffer = ""
  408. self.state = self.rcdataEndTagOpenState
  409. else:
  410. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
  411. self.stream.unget(data)
  412. self.state = self.rcdataState
  413. return True
  414. def rcdataEndTagOpenState(self):
  415. data = self.stream.char()
  416. if data in asciiLetters:
  417. self.temporaryBuffer += data
  418. self.state = self.rcdataEndTagNameState
  419. else:
  420. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
  421. self.stream.unget(data)
  422. self.state = self.rcdataState
  423. return True
    def rcdataEndTagNameState(self):
        """Collect an end tag name in RCDATA.

        Only an "appropriate" end tag -- one whose name matches the last
        start tag token, case-insensitively -- actually closes the section;
        otherwise the buffered text is emitted as characters.
        """
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            # Not an appropriate end tag: emit "</" + buffer as text and
            # reprocess the current character in RCDATA.
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True
  451. def rawtextLessThanSignState(self):
  452. data = self.stream.char()
  453. if data == "/":
  454. self.temporaryBuffer = ""
  455. self.state = self.rawtextEndTagOpenState
  456. else:
  457. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
  458. self.stream.unget(data)
  459. self.state = self.rawtextState
  460. return True
  461. def rawtextEndTagOpenState(self):
  462. data = self.stream.char()
  463. if data in asciiLetters:
  464. self.temporaryBuffer += data
  465. self.state = self.rawtextEndTagNameState
  466. else:
  467. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
  468. self.stream.unget(data)
  469. self.state = self.rawtextState
  470. return True
    def rawtextEndTagNameState(self):
        """Collect an end tag name in RAWTEXT.

        Mirrors rcdataEndTagNameState: only an end tag matching the last
        start tag (case-insensitively) closes the section.
        """
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            # Not an appropriate end tag: flush "</" + buffer as text.
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True
  498. def scriptDataLessThanSignState(self):
  499. data = self.stream.char()
  500. if data == "/":
  501. self.temporaryBuffer = ""
  502. self.state = self.scriptDataEndTagOpenState
  503. elif data == "!":
  504. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
  505. self.state = self.scriptDataEscapeStartState
  506. else:
  507. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
  508. self.stream.unget(data)
  509. self.state = self.scriptDataState
  510. return True
  511. def scriptDataEndTagOpenState(self):
  512. data = self.stream.char()
  513. if data in asciiLetters:
  514. self.temporaryBuffer += data
  515. self.state = self.scriptDataEndTagNameState
  516. else:
  517. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
  518. self.stream.unget(data)
  519. self.state = self.scriptDataState
  520. return True
    def scriptDataEndTagNameState(self):
        """Collect an end tag name in script data.

        Mirrors rcdataEndTagNameState: only an end tag matching the last
        start tag (case-insensitively) closes the section.
        """
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            # Not an appropriate end tag: flush "</" + buffer as text.
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True
  548. def scriptDataEscapeStartState(self):
  549. data = self.stream.char()
  550. if data == "-":
  551. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
  552. self.state = self.scriptDataEscapeStartDashState
  553. else:
  554. self.stream.unget(data)
  555. self.state = self.scriptDataState
  556. return True
  557. def scriptDataEscapeStartDashState(self):
  558. data = self.stream.char()
  559. if data == "-":
  560. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
  561. self.state = self.scriptDataEscapedDashDashState
  562. else:
  563. self.stream.unget(data)
  564. self.state = self.scriptDataState
  565. return True
    def scriptDataEscapedState(self):
        """Inside a "<!--" escaped section of script data."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True
    def scriptDataEscapedDashState(self):
        """One "-" seen inside escaped script data."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True
    def scriptDataEscapedDashDashState(self):
        """Two or more "-" seen inside escaped script data; ">" here
        (i.e. "-->") ends the escaped section."""
        data = self.stream.char()
        if data == "-":
            # Stay in this state for runs of dashes.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True
  625. def scriptDataEscapedLessThanSignState(self):
  626. data = self.stream.char()
  627. if data == "/":
  628. self.temporaryBuffer = ""
  629. self.state = self.scriptDataEscapedEndTagOpenState
  630. elif data in asciiLetters:
  631. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
  632. self.temporaryBuffer = data
  633. self.state = self.scriptDataDoubleEscapeStartState
  634. else:
  635. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
  636. self.stream.unget(data)
  637. self.state = self.scriptDataEscapedState
  638. return True
  639. def scriptDataEscapedEndTagOpenState(self):
  640. data = self.stream.char()
  641. if data in asciiLetters:
  642. self.temporaryBuffer = data
  643. self.state = self.scriptDataEscapedEndTagNameState
  644. else:
  645. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
  646. self.stream.unget(data)
  647. self.state = self.scriptDataEscapedState
  648. return True
    def scriptDataEscapedEndTagNameState(self):
        """Collect an end tag name in escaped script data.

        Mirrors rcdataEndTagNameState: only an end tag matching the last
        start tag (case-insensitively) closes the section.
        """
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            # Not an appropriate end tag: flush "</" + buffer as text.
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
    def scriptDataDoubleEscapeStartState(self):
        """Decide whether the tag being read inside escaped script data
        starts a double-escaped section (temporary buffer == "script")."""
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True
    def scriptDataDoubleEscapedState(self):
        """Inside a double-escaped section of script data (a "<script"
        appearing within an escaped section)."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True
    def scriptDataDoubleEscapedDashState(self):
        """One "-" seen inside double-escaped script data."""
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
    def scriptDataDoubleEscapedDashDashState(self):
        """Two or more "-" seen inside double-escaped script data; ">"
        here (i.e. "-->") returns to the plain script data state."""
        data = self.stream.char()
        if data == "-":
            # Stay in this state for runs of dashes.
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True
  757. def scriptDataDoubleEscapedLessThanSignState(self):
  758. data = self.stream.char()
  759. if data == "/":
  760. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
  761. self.temporaryBuffer = ""
  762. self.state = self.scriptDataDoubleEscapeEndState
  763. else:
  764. self.stream.unget(data)
  765. self.state = self.scriptDataDoubleEscapedState
  766. return True
  767. def scriptDataDoubleEscapeEndState(self):
  768. data = self.stream.char()
  769. if data in (spaceCharacters | frozenset(("/", ">"))):
  770. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
  771. if self.temporaryBuffer.lower() == "script":
  772. self.state = self.scriptDataEscapedState
  773. else:
  774. self.state = self.scriptDataDoubleEscapedState
  775. elif data in asciiLetters:
  776. self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
  777. self.temporaryBuffer += data
  778. else:
  779. self.stream.unget(data)
  780. self.state = self.scriptDataDoubleEscapedState
  781. return True
    def beforeAttributeNameState(self):
        """Before-attribute-name state: skip whitespace, start a new
        attribute on a letter, emit the tag on ">", or handle "/" and
        the error cases.  Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole whitespace run in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            # Start a fresh [name, value] attribute pair.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            # Parse error, but the character still begins an attribute name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            # NULL is reported and replaced with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
    def attributeNameState(self):
        """Accumulate the current attribute's name.

        When the state is left, the name is optionally lowercased and
        checked against earlier attributes for duplicates.  The
        ``emitToken`` flag defers tag emission until after that check.
        Always returns True.
        """
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            # Bulk-consume the rest of the letter run.
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            # Invalid but still appended to the name.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            if self.lowercaseAttrName:
                self.currentToken["data"][-1][0] = (
                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, value in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True
    def afterAttributeNameState(self):
        """After an attribute name: "=" starts a value, ">" emits the
        tag, and a letter starts the next (valueless) attribute.
        Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            # Previous attribute had no value; begin a new one.
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            # Parse error, but the character still starts a new attribute.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True
    def beforeAttributeValueState(self):
        """Decide how the attribute value is delimited (double-quoted,
        single-quoted, or unquoted).  Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            # Consume the whole whitespace run in one call.
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            # "&" belongs to the (unquoted) value itself: reprocess it there.
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            # "=" with no value at all.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            # Reported, but still taken as the first value character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True
  928. def attributeValueDoubleQuotedState(self):
  929. data = self.stream.char()
  930. if data == "\"":
  931. self.state = self.afterAttributeValueState
  932. elif data == "&":
  933. self.processEntityInAttribute('"')
  934. elif data == "\u0000":
  935. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  936. "data": "invalid-codepoint"})
  937. self.currentToken["data"][-1][1] += "\uFFFD"
  938. elif data is EOF:
  939. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  940. "eof-in-attribute-value-double-quote"})
  941. self.state = self.dataState
  942. else:
  943. self.currentToken["data"][-1][1] += data +\
  944. self.stream.charsUntil(("\"", "&", "\u0000"))
  945. return True
  946. def attributeValueSingleQuotedState(self):
  947. data = self.stream.char()
  948. if data == "'":
  949. self.state = self.afterAttributeValueState
  950. elif data == "&":
  951. self.processEntityInAttribute("'")
  952. elif data == "\u0000":
  953. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  954. "data": "invalid-codepoint"})
  955. self.currentToken["data"][-1][1] += "\uFFFD"
  956. elif data is EOF:
  957. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  958. "eof-in-attribute-value-single-quote"})
  959. self.state = self.dataState
  960. else:
  961. self.currentToken["data"][-1][1] += data +\
  962. self.stream.charsUntil(("'", "&", "\u0000"))
  963. return True
    def attributeValueUnQuotedState(self):
        """Accumulate an unquoted attribute value until whitespace or
        ">".  Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            # Character reference; ">" is the extra allowed character.
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            # Accepted into the value, but reported as a parse error.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            # Bulk-read ordinary characters up to the next special one.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True
  988. def afterAttributeValueState(self):
  989. data = self.stream.char()
  990. if data in spaceCharacters:
  991. self.state = self.beforeAttributeNameState
  992. elif data == ">":
  993. self.emitCurrentToken()
  994. elif data == "/":
  995. self.state = self.selfClosingStartTagState
  996. elif data is EOF:
  997. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  998. "unexpected-EOF-after-attribute-value"})
  999. self.stream.unget(data)
  1000. self.state = self.dataState
  1001. else:
  1002. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1003. "unexpected-character-after-attribute-value"})
  1004. self.stream.unget(data)
  1005. self.state = self.beforeAttributeNameState
  1006. return True
  1007. def selfClosingStartTagState(self):
  1008. data = self.stream.char()
  1009. if data == ">":
  1010. self.currentToken["selfClosing"] = True
  1011. self.emitCurrentToken()
  1012. elif data is EOF:
  1013. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1014. "data":
  1015. "unexpected-EOF-after-solidus-in-tag"})
  1016. self.stream.unget(data)
  1017. self.state = self.dataState
  1018. else:
  1019. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1020. "unexpected-character-after-solidus-in-tag"})
  1021. self.stream.unget(data)
  1022. self.state = self.beforeAttributeNameState
  1023. return True
  1024. def bogusCommentState(self):
  1025. # Make a new comment token and give it as value all the characters
  1026. # until the first > or EOF (charsUntil checks for EOF automatically)
  1027. # and emit it.
  1028. data = self.stream.charsUntil(">")
  1029. data = data.replace("\u0000", "\uFFFD")
  1030. self.tokenQueue.append(
  1031. {"type": tokenTypes["Comment"], "data": data})
  1032. # Eat the character directly after the bogus comment which is either a
  1033. # ">" or an EOF.
  1034. self.stream.char()
  1035. self.state = self.dataState
  1036. return True
    def markupDeclarationOpenState(self):
        """Dispatch "<!" to comment ("--"), DOCTYPE, or CDATA handling.

        Characters are collected on ``charStack``; if no construct
        matches, every consumed character is pushed back (in reverse)
        and the bogus-comment state takes over.  Always returns True.
        """
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                # "<!--": begin a comment token.
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            # Case-insensitive match of the remaining "OCTYPE" letters.
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            # "<![CDATA[" is only honoured in foreign (non-HTML) content;
            # this match is case-sensitive.
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        # Nothing matched: report and reprocess as a bogus comment.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True
    def commentStartState(self):
        """First character after "<!--".  Always returns True."""
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            # NULL is reported and replaced; state is unchanged.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            # "<!-->": an abruptly closed (empty) comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True
    def commentStartDashState(self):
        """Saw "-" right after "<!--"; a second "-" heads toward the
        comment end.  Always returns True.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The consumed "-" becomes comment data along with U+FFFD;
            # state is unchanged.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            # "<!--->": an abruptly closed comment.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # The pending "-" is ordinary comment data after all.
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
  1123. def commentState(self):
  1124. data = self.stream.char()
  1125. if data == "-":
  1126. self.state = self.commentEndDashState
  1127. elif data == "\u0000":
  1128. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1129. "data": "invalid-codepoint"})
  1130. self.currentToken["data"] += "\uFFFD"
  1131. elif data is EOF:
  1132. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1133. "data": "eof-in-comment"})
  1134. self.tokenQueue.append(self.currentToken)
  1135. self.state = self.dataState
  1136. else:
  1137. self.currentToken["data"] += data + \
  1138. self.stream.charsUntil(("-", "\u0000"))
  1139. return True
    def commentEndDashState(self):
        """Saw one "-" inside a comment; a second may end it.
        Always returns True.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            # The pending "-" joins the comment data with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Not an end sequence: the "-" was ordinary data.
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True
    def commentEndState(self):
        """Saw "--" inside a comment; ">" finishes it, anything else is
        an error of some kind.  Always returns True.
        """
        data = self.stream.char()
        if data == ">":
            # "-->": emit the finished comment.
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            # Both pending dashes become comment data with U+FFFD.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            # Extra dash: report it, keep waiting for ">".
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True
    def commentEndBangState(self):
        """Saw "--!" inside a comment; ">" still closes it.
        Always returns True.
        """
        data = self.stream.char()
        if data == ">":
            # "--!>": emit the comment anyway.
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            # "--!-": the "--!" becomes data, and the new "-" restarts
            # the end-dash sequence.
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True
  1210. def doctypeState(self):
  1211. data = self.stream.char()
  1212. if data in spaceCharacters:
  1213. self.state = self.beforeDoctypeNameState
  1214. elif data is EOF:
  1215. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1216. "expected-doctype-name-but-got-eof"})
  1217. self.currentToken["correct"] = False
  1218. self.tokenQueue.append(self.currentToken)
  1219. self.state = self.dataState
  1220. else:
  1221. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1222. "need-space-after-doctype"})
  1223. self.stream.unget(data)
  1224. self.state = self.beforeDoctypeNameState
  1225. return True
    def beforeDoctypeNameState(self):
        """Skip whitespace, then begin collecting the doctype name.
        Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            # "<!DOCTYPE>": no name at all.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            # NULL is replaced with U+FFFD as the first name character.
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True
    def doctypeNameState(self):
        """Accumulate the doctype name; it is ASCII-lowercased whenever
        this state is left.  Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            # End of doctype: lowercase the name and emit.
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True
    def afterDoctypeNameState(self):
        """After the doctype name: look for ">", PUBLIC or SYSTEM.

        The keyword match is case-insensitive and consumes characters
        directly from the stream; on failure only the last character
        read needs to be pushed back.  Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                # Try to match the rest of "PUBLIC".
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                # Try to match the rest of "SYSTEM".
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True
  1323. def afterDoctypePublicKeywordState(self):
  1324. data = self.stream.char()
  1325. if data in spaceCharacters:
  1326. self.state = self.beforeDoctypePublicIdentifierState
  1327. elif data in ("'", '"'):
  1328. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1329. "unexpected-char-in-doctype"})
  1330. self.stream.unget(data)
  1331. self.state = self.beforeDoctypePublicIdentifierState
  1332. elif data is EOF:
  1333. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1334. "eof-in-doctype"})
  1335. self.currentToken["correct"] = False
  1336. self.tokenQueue.append(self.currentToken)
  1337. self.state = self.dataState
  1338. else:
  1339. self.stream.unget(data)
  1340. self.state = self.beforeDoctypePublicIdentifierState
  1341. return True
    def beforeDoctypePublicIdentifierState(self):
        """Skip whitespace, then expect a quoted public identifier.
        Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            # Doctype ends before any identifier appeared.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Unquoted garbage: the rest of the doctype is bogus.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
  1370. def doctypePublicIdentifierDoubleQuotedState(self):
  1371. data = self.stream.char()
  1372. if data == "\"":
  1373. self.state = self.afterDoctypePublicIdentifierState
  1374. elif data == "\u0000":
  1375. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1376. "data": "invalid-codepoint"})
  1377. self.currentToken["publicId"] += "\uFFFD"
  1378. elif data == ">":
  1379. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1380. "unexpected-end-of-doctype"})
  1381. self.currentToken["correct"] = False
  1382. self.tokenQueue.append(self.currentToken)
  1383. self.state = self.dataState
  1384. elif data is EOF:
  1385. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1386. "eof-in-doctype"})
  1387. self.currentToken["correct"] = False
  1388. self.tokenQueue.append(self.currentToken)
  1389. self.state = self.dataState
  1390. else:
  1391. self.currentToken["publicId"] += data
  1392. return True
  1393. def doctypePublicIdentifierSingleQuotedState(self):
  1394. data = self.stream.char()
  1395. if data == "'":
  1396. self.state = self.afterDoctypePublicIdentifierState
  1397. elif data == "\u0000":
  1398. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1399. "data": "invalid-codepoint"})
  1400. self.currentToken["publicId"] += "\uFFFD"
  1401. elif data == ">":
  1402. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1403. "unexpected-end-of-doctype"})
  1404. self.currentToken["correct"] = False
  1405. self.tokenQueue.append(self.currentToken)
  1406. self.state = self.dataState
  1407. elif data is EOF:
  1408. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1409. "eof-in-doctype"})
  1410. self.currentToken["correct"] = False
  1411. self.tokenQueue.append(self.currentToken)
  1412. self.state = self.dataState
  1413. else:
  1414. self.currentToken["publicId"] += data
  1415. return True
    def afterDoctypePublicIdentifierState(self):
        """After the public identifier: whitespace, ">", or (with a
        parse error) an immediately following system identifier.
        Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == '"':
            # Missing space before the system identifier.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
  1445. def betweenDoctypePublicAndSystemIdentifiersState(self):
  1446. data = self.stream.char()
  1447. if data in spaceCharacters:
  1448. pass
  1449. elif data == ">":
  1450. self.tokenQueue.append(self.currentToken)
  1451. self.state = self.dataState
  1452. elif data == '"':
  1453. self.currentToken["systemId"] = ""
  1454. self.state = self.doctypeSystemIdentifierDoubleQuotedState
  1455. elif data == "'":
  1456. self.currentToken["systemId"] = ""
  1457. self.state = self.doctypeSystemIdentifierSingleQuotedState
  1458. elif data == EOF:
  1459. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1460. "eof-in-doctype"})
  1461. self.currentToken["correct"] = False
  1462. self.tokenQueue.append(self.currentToken)
  1463. self.state = self.dataState
  1464. else:
  1465. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1466. "unexpected-char-in-doctype"})
  1467. self.currentToken["correct"] = False
  1468. self.state = self.bogusDoctypeState
  1469. return True
  1470. def afterDoctypeSystemKeywordState(self):
  1471. data = self.stream.char()
  1472. if data in spaceCharacters:
  1473. self.state = self.beforeDoctypeSystemIdentifierState
  1474. elif data in ("'", '"'):
  1475. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1476. "unexpected-char-in-doctype"})
  1477. self.stream.unget(data)
  1478. self.state = self.beforeDoctypeSystemIdentifierState
  1479. elif data is EOF:
  1480. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1481. "eof-in-doctype"})
  1482. self.currentToken["correct"] = False
  1483. self.tokenQueue.append(self.currentToken)
  1484. self.state = self.dataState
  1485. else:
  1486. self.stream.unget(data)
  1487. self.state = self.beforeDoctypeSystemIdentifierState
  1488. return True
    def beforeDoctypeSystemIdentifierState(self):
        """Skip whitespace, then expect a quoted system identifier.
        Always returns True.
        """
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["systemId"] = ""
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        elif data == ">":
            # Doctype ends before any identifier appeared.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # Unquoted garbage: the rest of the doctype is bogus.
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True
  1517. def doctypeSystemIdentifierDoubleQuotedState(self):
  1518. data = self.stream.char()
  1519. if data == "\"":
  1520. self.state = self.afterDoctypeSystemIdentifierState
  1521. elif data == "\u0000":
  1522. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1523. "data": "invalid-codepoint"})
  1524. self.currentToken["systemId"] += "\uFFFD"
  1525. elif data == ">":
  1526. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1527. "unexpected-end-of-doctype"})
  1528. self.currentToken["correct"] = False
  1529. self.tokenQueue.append(self.currentToken)
  1530. self.state = self.dataState
  1531. elif data is EOF:
  1532. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1533. "eof-in-doctype"})
  1534. self.currentToken["correct"] = False
  1535. self.tokenQueue.append(self.currentToken)
  1536. self.state = self.dataState
  1537. else:
  1538. self.currentToken["systemId"] += data
  1539. return True
  1540. def doctypeSystemIdentifierSingleQuotedState(self):
  1541. data = self.stream.char()
  1542. if data == "'":
  1543. self.state = self.afterDoctypeSystemIdentifierState
  1544. elif data == "\u0000":
  1545. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1546. "data": "invalid-codepoint"})
  1547. self.currentToken["systemId"] += "\uFFFD"
  1548. elif data == ">":
  1549. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1550. "unexpected-end-of-doctype"})
  1551. self.currentToken["correct"] = False
  1552. self.tokenQueue.append(self.currentToken)
  1553. self.state = self.dataState
  1554. elif data is EOF:
  1555. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1556. "eof-in-doctype"})
  1557. self.currentToken["correct"] = False
  1558. self.tokenQueue.append(self.currentToken)
  1559. self.state = self.dataState
  1560. else:
  1561. self.currentToken["systemId"] += data
  1562. return True
  1563. def afterDoctypeSystemIdentifierState(self):
  1564. data = self.stream.char()
  1565. if data in spaceCharacters:
  1566. pass
  1567. elif data == ">":
  1568. self.tokenQueue.append(self.currentToken)
  1569. self.state = self.dataState
  1570. elif data is EOF:
  1571. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1572. "eof-in-doctype"})
  1573. self.currentToken["correct"] = False
  1574. self.tokenQueue.append(self.currentToken)
  1575. self.state = self.dataState
  1576. else:
  1577. self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
  1578. "unexpected-char-in-doctype"})
  1579. self.state = self.bogusDoctypeState
  1580. return True
  1581. def bogusDoctypeState(self):
  1582. data = self.stream.char()
  1583. if data == ">":
  1584. self.tokenQueue.append(self.currentToken)
  1585. self.state = self.dataState
  1586. elif data is EOF:
  1587. # XXX EMIT
  1588. self.stream.unget(data)
  1589. self.tokenQueue.append(self.currentToken)
  1590. self.state = self.dataState
  1591. else:
  1592. pass
  1593. return True
  1594. def cdataSectionState(self):
  1595. data = []
  1596. while True:
  1597. data.append(self.stream.charsUntil("]"))
  1598. data.append(self.stream.charsUntil(">"))
  1599. char = self.stream.char()
  1600. if char == EOF:
  1601. break
  1602. else:
  1603. assert char == ">"
  1604. if data[-1][-2:] == "]]":
  1605. data[-1] = data[-1][:-2]
  1606. break
  1607. else:
  1608. data.append(char)
  1609. data = "".join(data)
  1610. # Deal with null here rather than in the parser
  1611. nullCount = data.count("\u0000")
  1612. if nullCount > 0:
  1613. for i in range(nullCount):
  1614. self.tokenQueue.append({"type": tokenTypes["ParseError"],
  1615. "data": "invalid-codepoint"})
  1616. data = data.replace("\u0000", "\uFFFD")
  1617. if data:
  1618. self.tokenQueue.append({"type": tokenTypes["Characters"],
  1619. "data": data})
  1620. self.state = self.dataState
  1621. return True