from __future__ import absolute_import, division, unicode_literals

from six import text_type
from six.moves import http_client

import codecs
import re

from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
from . import utils

from io import StringIO

try:
    from io import BytesIO
except ImportError:
    BytesIO = StringIO

try:
    from io import BufferedIOBase
except ImportError:
    class BufferedIOBase(object):
        pass

# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"

if utils.supports_lone_surrogates:
    # Use one extra step of indirection and create surrogates with
    # unichr. Not using this indirection would introduce an illegal
    # unicode literal on platforms not supporting such lone
    # surrogates.
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
                                    eval('"\\uD800-\\uDFFF"'))
else:
    invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])

ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")

# Cache for charsUntil()
charsUntilRegEx = {}


class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset

    def tell(self):
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return b"".join(rv)


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)

    if isUnicode:
        if encoding is not None:
            raise TypeError("Cannot explicitly set an encoding with a unicode string")
        return HTMLUnicodeInputStream(source)
    else:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
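

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): shows how the
# HTMLInputStream factory dispatches on the type of its source. The helper
# name _demo_input_stream_factory is hypothetical and the function is never
# called at import time.
def _demo_input_stream_factory():
    binary = HTMLInputStream(b"<p>hello</p>")  # bytes -> HTMLBinaryInputStream
    text = HTMLInputStream("<p>hello</p>")     # text  -> HTMLUnicodeInputStream
    assert isinstance(binary, HTMLBinaryInputStream)
    assert isinstance(text, HTMLUnicodeInputStream)
    # Combining a unicode source with an explicit encoding is rejected:
    try:
        HTMLInputStream("<p>hello</p>", encoding="utf-8")
    except TypeError:
        pass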
class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of removing or replacing incorrect
    byte-sequences and also provides column and line tracking.
    """

    _defaultChunkSize = 10240

    def __init__(self, source):
        """Initialises the HTMLUnicodeInputStream.

        HTMLUnicodeInputStream(source) -> Normalized stream from source
        for use by html5lib.

        source can be either a file object or a unicode string. Encoding
        detection and <meta> parsing are handled by HTMLBinaryInputStream;
        this class assumes already-decoded text.
        """
        if not utils.supports_lone_surrogates:
            # Such platforms will have already checked for such
            # surrogate errors, so no need to do this checking.
            self.reportCharacterErrors = None
            self.replaceCharactersRegexp = None
        elif len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
            self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
            self.replaceCharactersRegexp = re.compile(
                eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))

        # List of where new lines occur
        self.newLines = [0]

        self.charEncoding = ("utf-8", "certain")
        self.dataStream = self.openStream(source)

        self.reset()

    def reset(self):
        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []

        # number of (complete) lines in previous chunks
        self.prevNumLines = 0
        # number of columns in the last line of the previous chunk
        self.prevNumCols = 0

        # Deal with CR LF and surrogates split over chunk boundaries
        self._bufferedCharacter = None
    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object or a string.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = StringIO(source)

        return stream
    def _position(self, offset):
        chunk = self.chunk
        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
            positionColumn = offset - (lastLinePos + 1)
        return (positionLine, positionColumn)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        line, col = self._position(self.chunkOffset)
        return (line + 1, col)

    def char(self):
        """ Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        """
        # Read a new chunk from the input stream if necessary
        if self.chunkOffset >= self.chunkSize:
            if not self.readChunk():
                return EOF

        chunkOffset = self.chunkOffset
        char = self.chunk[chunkOffset]
        self.chunkOffset = chunkOffset + 1

        return char

    def readChunk(self, chunkSize=None):
        if chunkSize is None:
            chunkSize = self._defaultChunkSize

        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)

        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0

        data = self.dataStream.read(chunkSize)

        # Deal with CR LF and surrogates broken across chunks
        if self._bufferedCharacter:
            data = self._bufferedCharacter + data
            self._bufferedCharacter = None
        elif not data:
            # We have no more data, bye-bye stream
            return False

        if len(data) > 1:
            lastv = ord(data[-1])
            if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
                self._bufferedCharacter = data[-1]
                data = data[:-1]

        if self.reportCharacterErrors:
            self.reportCharacterErrors(data)

        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
        data = self.replaceCharactersRegexp.sub("\ufffd", data)

        data = data.replace("\r\n", "\n")
        data = data.replace("\r", "\n")

        self.chunk = data
        self.chunkSize = len(data)

        return True

    def characterErrorsUCS4(self, data):
        for i in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")

    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
            codepoint = ord(match.group())
            pos = match.start()
            # Pretty sure there should be endianness issues here
            if utils.isSurrogatePair(data[pos:pos + 2]):
                # We have a surrogate pair!
                char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
                if char_val in non_bmp_invalid_codepoints:
                    self.errors.append("invalid-codepoint")
                skip = True
            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
                  pos == len(data) - 1):
                self.errors.append("invalid-codepoint")
            else:
                skip = False
                self.errors.append("invalid-codepoint")

    def charsUntil(self, characters, opposite=False):
        """ Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        """

        # Use a cache of regexps to find the required characters
        try:
            chars = charsUntilRegEx[(characters, opposite)]
        except KeyError:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
                regex = "^%s" % regex
            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)

        rv = []

        while True:
            # Find the longest matching prefix
            m = chars.match(self.chunk, self.chunkOffset)
            if m is None:
                # If nothing matched, and it wasn't because we ran out of chunk,
                # then stop
                if self.chunkOffset != self.chunkSize:
                    break
            else:
                end = m.end()
                # If not the whole chunk matched, return everything
                # up to the part that didn't match
                if end != self.chunkSize:
                    rv.append(self.chunk[self.chunkOffset:end])
                    self.chunkOffset = end
                    break
            # If the whole remainder of the chunk matched,
            # use it all and read the next chunk
            rv.append(self.chunk[self.chunkOffset:])
            if not self.readChunk():
                # Reached EOF
                break

        r = "".join(rv)
        return r

    def unget(self, char):
        # Only one character is allowed to be ungotten at once - it must
        # be consumed again before any further call to unget
        if char is not None:
            if self.chunkOffset == 0:
                # unget is called quite rarely, so it's a good idea to do
                # more work here if it saves a bit of work in the frequently
                # called char and charsUntil.
                # So, just prepend the ungotten character onto the current
                # chunk:
                self.chunk = char + self.chunk
                self.chunkSize += 1
            else:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
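

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): reading from a
# unicode stream with char()/charsUntil() and the (line, col) tracking. The
# helper name _demo_unicode_stream is hypothetical and the function is never
# called at import time.
def _demo_unicode_stream():
    stream = HTMLUnicodeInputStream("<p>one\ntwo</p>")
    assert stream.char() == "<"
    assert stream.charsUntil(">") == "p"  # stops before (not including) '>'
    stream.char()                         # consume the '>'
    assert stream.charsUntil("\n") == "one"
    assert stream.position() == (1, 6)    # 1-based line, 0-based column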
class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.
    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file object or a byte string.

        The optional encoding parameter must be a string that indicates
        the encoding. If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).

        parseMeta - Look for a <meta> element containing encoding information
        """
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        # self.charEncoding as appropriate
        self.rawStream = self.openStream(source)

        HTMLUnicodeInputStream.__init__(self, self.rawStream)
        self.charEncoding = (codecName(encoding), "certain")

        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to feed chardet when guessing the encoding
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"

        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)

        # Call superclass
        self.reset()

    def reset(self):
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)
    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object or a byte string.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)

        try:
            stream.seek(stream.tell())
        except Exception:
            stream = BufferedStream(stream)

        return stream
    def detectEncoding(self, parseMeta=True, chardet=True):
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding

        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}

        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]

        return encoding, confidence
    def changeEncoding(self, newEncoding):
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            # Update the encoding before reset() so the new codec reader is
            # built with it, and keep the old name for the error message
            oldEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" %
                                   (oldEncoding, newEncoding))
    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])  # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)  # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()

        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"

        return encoding
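

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): encoding
# detection from a BOM versus from a <meta> declaration. The helper name
# _demo_encoding_detection is hypothetical and the function is never called
# at import time.
def _demo_encoding_detection():
    # A UTF-8 BOM is consumed and reported with "certain" confidence.
    with_bom = HTMLBinaryInputStream(codecs.BOM_UTF8 + b"<p>hi</p>")
    assert with_bom.charEncoding == ("utf-8", "certain")
    # A <meta charset> declaration found by the pre-parser is only "tentative".
    with_meta = HTMLBinaryInputStream(b'<meta charset="utf-8"><p>hi</p>',
                                      chardet=False)
    assert with_meta.charEncoding == ("utf-8", "tentative")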
class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""

    def __new__(self, value):
        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())

    def __init__(self, value):
        self._position = -1

    def __iter__(self):
        return self

    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        return self[p:p + 1]

    def next(self):
        # Py2 compat
        return self.__next__()

    def previous(self):
        p = self._position
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
        return self[p:p + 1]

    def setPosition(self, position):
        if self._position >= len(self):
            raise StopIteration
        self._position = position

    def getPosition(self):
        if self._position >= len(self):
            raise StopIteration
        if self._position >= 0:
            return self._position
        else:
            return None

    position = property(getPosition, setPosition)

    def getCurrentByte(self):
        return self[self.position:self.position + 1]

    currentByte = property(getCurrentByte)

    def skip(self, chars=spaceCharactersBytes):
        """Skip past a list of characters"""
        p = self.position  # use property for the error-checking
        while p < len(self):
            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
            p += 1
        self._position = p
        return None

    def matchBytes(self, bytes):
        """Look for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone"""
        p = self.position
        data = self[p:p + len(bytes)]
        rv = data.startswith(bytes)
        if rv:
            self.position += len(bytes)
        return rv

    def jumpTo(self, bytes):
        """Look for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the match"""
        newPosition = self[self.position:].find(bytes)
        if newPosition > -1:
            # XXX: This is ugly, but I can't see a nicer way to fix this.
            if self._position == -1:
                self._position = 0
            self._position += (newPosition + len(bytes) - 1)
            return True
        else:
            raise StopIteration
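

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): walking an
# EncodingBytes buffer. The value is lowercased on construction, iteration
# yields single bytes, and matchBytes/skip move the position. The helper name
# _demo_encoding_bytes is hypothetical and the function is never called at
# import time.
def _demo_encoding_bytes():
    data = EncodingBytes(b"<META Charset=UTF-8>")
    assert bytes(data) == b"<meta charset=utf-8>"  # lowercased copy
    assert next(data) == b"<"                      # advances to position 0
    assert next(data) == b"m"                      # advances to position 1
    assert data.matchBytes(b"meta")                # matches at the current position
    assert data.skip() == b"c"                     # skips the space, returns next byte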
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """string - the data to work on for encoding detection"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        methodDispatch = (
            (b"<!--", self.handleComment),
            (b"<meta", self.handleMeta),
            (b"</", self.handlePossibleEndTag),
            (b"<!", self.handleOther),
            (b"<?", self.handleOther),
            (b"<", self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
                if self.data.matchBytes(key):
                    try:
                        keepParsing = method()
                        break
                    except StopIteration:
                        keepParsing = False
                        break
            if not keepParsing:
                break

        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo(b"-->")
    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # If we have <meta not followed by a space, just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec
    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        next(self.data)
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        data = self.data
        if data.currentByte not in asciiLettersBytes:
            # If the next byte is not an ascii letter either ignore this
            # fragment (possible start tag case) or treat it according to
            # handleOther
            if endTag:
                data.previous()
                self.handleOther()
            return True

        c = data.skipUntil(spacesAngleBrackets)
        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
        else:
            # Read all attributes
            attr = self.getAttribute()
            while attr is not None:
                attr = self.getAttribute()
        return True

    def handleOther(self):
        return self.data.jumpTo(b">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                break
            elif c in (b"/", b">"):
                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
            c = next(data)
        # Step 7
        if c != b"=":
            data.previous()
            return b"".join(attrName), b""
        # Step 8
        next(data)
        # Step 9
        c = data.skip()
        # Step 10
        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
                c = next(data)
                # 10.3
                if c == quoteChar:
                    next(data)
                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
        elif c == b">":
            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
            return None
        else:
            attrValue.append(c)
        # Step 11
        while True:
            c = next(data)
            if c in spacesAngleBrackets:
                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
                return None
            else:
                attrValue.append(c)
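

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): running the
# mini parser over the first bytes of a document. The helper name
# _demo_encoding_parser is hypothetical and the function is never called at
# import time.
def _demo_encoding_parser():
    head = b'<!--x--><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    assert EncodingParser(head).getEncoding() == "utf-8"
    # With no pragma or charset attribute, no encoding is reported.
    assert EncodingParser(b"<p>plain markup</p>").getEncoding() is None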
class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data

    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
                if self.data.jumpTo(quoteMark):
                    return self.data[oldPosition:self.data.position]
                else:
                    return None
            else:
                # Unquoted value
                oldPosition = self.data.position
                try:
                    self.data.skipUntil(spaceCharactersBytes)
                    return self.data[oldPosition:self.data.position]
                except StopIteration:
                    # Return the whole remaining value
                    return self.data[oldPosition:]
        except StopIteration:
            return None
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if encoding:
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
        return None
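

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): codecName
# normalizes labels by stripping ASCII punctuation and lowercasing before the
# lookup in constants.encodings. The helper name _demo_codec_name is
# hypothetical and the function is never called at import time.
def _demo_codec_name():
    assert codecName("UTF-8") == codecName(b"utf8") == "utf-8"
    assert codecName("no-such-encoding") is None  # unknown labels map to None
    assert codecName(b"\xff") is None             # non-ASCII byte labels map to None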