You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

842 lines
29 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. """Beautiful Soup bonus library: Unicode, Dammit
  3. This library converts a bytestream to Unicode through any means
  4. necessary. It is heavily based on code from Mark Pilgrim's Universal
  5. Feed Parser. It works best on XML and HTML, but it does not rewrite the
  6. XML or HTML to reflect a new encoding; that's the tree builder's job.
  7. """
  8. # Use of this source code is governed by a BSD-style license that can be
  9. # found in the LICENSE file.
  10. __license__ = "MIT"
  11. import codecs
  12. from html.entities import codepoint2name
  13. import re
  14. import logging
  15. import string
  16. # Import a library to autodetect character encodings.
  17. chardet_type = None
  18. try:
  19. # First try the fast C implementation.
  20. # PyPI package: cchardet
  21. import cchardet
  22. def chardet_dammit(s):
  23. return cchardet.detect(s)['encoding']
  24. except ImportError:
  25. try:
  26. # Fall back to the pure Python implementation
  27. # Debian package: python-chardet
  28. # PyPI package: chardet
  29. import chardet
  30. def chardet_dammit(s):
  31. return chardet.detect(s)['encoding']
  32. #import chardet.constants
  33. #chardet.constants._debug = 1
  34. except ImportError:
  35. # No chardet available.
  36. def chardet_dammit(s):
  37. return None
  38. # Available from http://cjkpython.i18n.org/.
  39. try:
  40. import iconv_codec
  41. except ImportError:
  42. pass
  43. xml_encoding_re = re.compile(
  44. '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
  45. html_meta_re = re.compile(
  46. '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
  47. class EntitySubstitution(object):
  48. """Substitute XML or HTML entities for the corresponding characters."""
  49. def _populate_class_variables():
  50. lookup = {}
  51. reverse_lookup = {}
  52. characters_for_re = []
  53. for codepoint, name in list(codepoint2name.items()):
  54. character = chr(codepoint)
  55. if codepoint != 34:
  56. # There's no point in turning the quotation mark into
  57. # &quot;, unless it happens within an attribute value, which
  58. # is handled elsewhere.
  59. characters_for_re.append(character)
  60. lookup[character] = name
  61. # But we do want to turn &quot; into the quotation mark.
  62. reverse_lookup[name] = character
  63. re_definition = "[%s]" % "".join(characters_for_re)
  64. return lookup, reverse_lookup, re.compile(re_definition)
  65. (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
  66. CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
  67. CHARACTER_TO_XML_ENTITY = {
  68. "'": "apos",
  69. '"': "quot",
  70. "&": "amp",
  71. "<": "lt",
  72. ">": "gt",
  73. }
  74. BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
  75. "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
  76. ")")
  77. AMPERSAND_OR_BRACKET = re.compile("([<>&])")
  78. @classmethod
  79. def _substitute_html_entity(cls, matchobj):
  80. entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
  81. return "&%s;" % entity
  82. @classmethod
  83. def _substitute_xml_entity(cls, matchobj):
  84. """Used with a regular expression to substitute the
  85. appropriate XML entity for an XML special character."""
  86. entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
  87. return "&%s;" % entity
  88. @classmethod
  89. def quoted_attribute_value(self, value):
  90. """Make a value into a quoted XML attribute, possibly escaping it.
  91. Most strings will be quoted using double quotes.
  92. Bob's Bar -> "Bob's Bar"
  93. If a string contains double quotes, it will be quoted using
  94. single quotes.
  95. Welcome to "my bar" -> 'Welcome to "my bar"'
  96. If a string contains both single and double quotes, the
  97. double quotes will be escaped, and the string will be quoted
  98. using double quotes.
  99. Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
  100. """
  101. quote_with = '"'
  102. if '"' in value:
  103. if "'" in value:
  104. # The string contains both single and double
  105. # quotes. Turn the double quotes into
  106. # entities. We quote the double quotes rather than
  107. # the single quotes because the entity name is
  108. # "&quot;" whether this is HTML or XML. If we
  109. # quoted the single quotes, we'd have to decide
  110. # between &apos; and &squot;.
  111. replace_with = "&quot;"
  112. value = value.replace('"', replace_with)
  113. else:
  114. # There are double quotes but no single quotes.
  115. # We can use single quotes to quote the attribute.
  116. quote_with = "'"
  117. return quote_with + value + quote_with
  118. @classmethod
  119. def substitute_xml(cls, value, make_quoted_attribute=False):
  120. """Substitute XML entities for special XML characters.
  121. :param value: A string to be substituted. The less-than sign
  122. will become &lt;, the greater-than sign will become &gt;,
  123. and any ampersands will become &amp;. If you want ampersands
  124. that appear to be part of an entity definition to be left
  125. alone, use substitute_xml_containing_entities() instead.
  126. :param make_quoted_attribute: If True, then the string will be
  127. quoted, as befits an attribute value.
  128. """
  129. # Escape angle brackets and ampersands.
  130. value = cls.AMPERSAND_OR_BRACKET.sub(
  131. cls._substitute_xml_entity, value)
  132. if make_quoted_attribute:
  133. value = cls.quoted_attribute_value(value)
  134. return value
  135. @classmethod
  136. def substitute_xml_containing_entities(
  137. cls, value, make_quoted_attribute=False):
  138. """Substitute XML entities for special XML characters.
  139. :param value: A string to be substituted. The less-than sign will
  140. become &lt;, the greater-than sign will become &gt;, and any
  141. ampersands that are not part of an entity defition will
  142. become &amp;.
  143. :param make_quoted_attribute: If True, then the string will be
  144. quoted, as befits an attribute value.
  145. """
  146. # Escape angle brackets, and ampersands that aren't part of
  147. # entities.
  148. value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
  149. cls._substitute_xml_entity, value)
  150. if make_quoted_attribute:
  151. value = cls.quoted_attribute_value(value)
  152. return value
  153. @classmethod
  154. def substitute_html(cls, s):
  155. """Replace certain Unicode characters with named HTML entities.
  156. This differs from data.encode(encoding, 'xmlcharrefreplace')
  157. in that the goal is to make the result more readable (to those
  158. with ASCII displays) rather than to recover from
  159. errors. There's absolutely nothing wrong with a UTF-8 string
  160. containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
  161. character with "&eacute;" will make it more readable to some
  162. people.
  163. """
  164. return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
  165. cls._substitute_html_entity, s)
  166. class EncodingDetector:
  167. """Suggests a number of possible encodings for a bytestring.
  168. Order of precedence:
  169. 1. Encodings you specifically tell EncodingDetector to try first
  170. (the override_encodings argument to the constructor).
  171. 2. An encoding declared within the bytestring itself, either in an
  172. XML declaration (if the bytestring is to be interpreted as an XML
  173. document), or in a <meta> tag (if the bytestring is to be
  174. interpreted as an HTML document.)
  175. 3. An encoding detected through textual analysis by chardet,
  176. cchardet, or a similar external library.
  177. 4. UTF-8.
  178. 5. Windows-1252.
  179. """
  180. def __init__(self, markup, override_encodings=None, is_html=False,
  181. exclude_encodings=None):
  182. self.override_encodings = override_encodings or []
  183. exclude_encodings = exclude_encodings or []
  184. self.exclude_encodings = set([x.lower() for x in exclude_encodings])
  185. self.chardet_encoding = None
  186. self.is_html = is_html
  187. self.declared_encoding = None
  188. # First order of business: strip a byte-order mark.
  189. self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
  190. def _usable(self, encoding, tried):
  191. if encoding is not None:
  192. encoding = encoding.lower()
  193. if encoding in self.exclude_encodings:
  194. return False
  195. if encoding not in tried:
  196. tried.add(encoding)
  197. return True
  198. return False
  199. @property
  200. def encodings(self):
  201. """Yield a number of encodings that might work for this markup."""
  202. tried = set()
  203. for e in self.override_encodings:
  204. if self._usable(e, tried):
  205. yield e
  206. # Did the document originally start with a byte-order mark
  207. # that indicated its encoding?
  208. if self._usable(self.sniffed_encoding, tried):
  209. yield self.sniffed_encoding
  210. # Look within the document for an XML or HTML encoding
  211. # declaration.
  212. if self.declared_encoding is None:
  213. self.declared_encoding = self.find_declared_encoding(
  214. self.markup, self.is_html)
  215. if self._usable(self.declared_encoding, tried):
  216. yield self.declared_encoding
  217. # Use third-party character set detection to guess at the
  218. # encoding.
  219. if self.chardet_encoding is None:
  220. self.chardet_encoding = chardet_dammit(self.markup)
  221. if self._usable(self.chardet_encoding, tried):
  222. yield self.chardet_encoding
  223. # As a last-ditch effort, try utf-8 and windows-1252.
  224. for e in ('utf-8', 'windows-1252'):
  225. if self._usable(e, tried):
  226. yield e
  227. @classmethod
  228. def strip_byte_order_mark(cls, data):
  229. """If a byte-order mark is present, strip it and return the encoding it implies."""
  230. encoding = None
  231. if isinstance(data, str):
  232. # Unicode data cannot have a byte-order mark.
  233. return data, encoding
  234. if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
  235. and (data[2:4] != '\x00\x00'):
  236. encoding = 'utf-16be'
  237. data = data[2:]
  238. elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
  239. and (data[2:4] != '\x00\x00'):
  240. encoding = 'utf-16le'
  241. data = data[2:]
  242. elif data[:3] == b'\xef\xbb\xbf':
  243. encoding = 'utf-8'
  244. data = data[3:]
  245. elif data[:4] == b'\x00\x00\xfe\xff':
  246. encoding = 'utf-32be'
  247. data = data[4:]
  248. elif data[:4] == b'\xff\xfe\x00\x00':
  249. encoding = 'utf-32le'
  250. data = data[4:]
  251. return data, encoding
  252. @classmethod
  253. def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
  254. """Given a document, tries to find its declared encoding.
  255. An XML encoding is declared at the beginning of the document.
  256. An HTML encoding is declared in a <meta> tag, hopefully near the
  257. beginning of the document.
  258. """
  259. if search_entire_document:
  260. xml_endpos = html_endpos = len(markup)
  261. else:
  262. xml_endpos = 1024
  263. html_endpos = max(2048, int(len(markup) * 0.05))
  264. declared_encoding = None
  265. declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
  266. if not declared_encoding_match and is_html:
  267. declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
  268. if declared_encoding_match is not None:
  269. declared_encoding = declared_encoding_match.groups()[0].decode(
  270. 'ascii', 'replace')
  271. if declared_encoding:
  272. return declared_encoding.lower()
  273. return None
  274. class UnicodeDammit:
  275. """A class for detecting the encoding of a *ML document and
  276. converting it to a Unicode string. If the source encoding is
  277. windows-1252, can replace MS smart quotes with their HTML or XML
  278. equivalents."""
  279. # This dictionary maps commonly seen values for "charset" in HTML
  280. # meta tags to the corresponding Python codec names. It only covers
  281. # values that aren't in Python's aliases and can't be determined
  282. # by the heuristics in find_codec.
  283. CHARSET_ALIASES = {"macintosh": "mac-roman",
  284. "x-sjis": "shift-jis"}
  285. ENCODINGS_WITH_SMART_QUOTES = [
  286. "windows-1252",
  287. "iso-8859-1",
  288. "iso-8859-2",
  289. ]
  290. def __init__(self, markup, override_encodings=[],
  291. smart_quotes_to=None, is_html=False, exclude_encodings=[]):
  292. self.smart_quotes_to = smart_quotes_to
  293. self.tried_encodings = []
  294. self.contains_replacement_characters = False
  295. self.is_html = is_html
  296. self.log = logging.getLogger(__name__)
  297. self.detector = EncodingDetector(
  298. markup, override_encodings, is_html, exclude_encodings)
  299. # Short-circuit if the data is in Unicode to begin with.
  300. if isinstance(markup, str) or markup == '':
  301. self.markup = markup
  302. self.unicode_markup = str(markup)
  303. self.original_encoding = None
  304. return
  305. # The encoding detector may have stripped a byte-order mark.
  306. # Use the stripped markup from this point on.
  307. self.markup = self.detector.markup
  308. u = None
  309. for encoding in self.detector.encodings:
  310. markup = self.detector.markup
  311. u = self._convert_from(encoding)
  312. if u is not None:
  313. break
  314. if not u:
  315. # None of the encodings worked. As an absolute last resort,
  316. # try them again with character replacement.
  317. for encoding in self.detector.encodings:
  318. if encoding != "ascii":
  319. u = self._convert_from(encoding, "replace")
  320. if u is not None:
  321. self.log.warning(
  322. "Some characters could not be decoded, and were "
  323. "replaced with REPLACEMENT CHARACTER."
  324. )
  325. self.contains_replacement_characters = True
  326. break
  327. # If none of that worked, we could at this point force it to
  328. # ASCII, but that would destroy so much data that I think
  329. # giving up is better.
  330. self.unicode_markup = u
  331. if not u:
  332. self.original_encoding = None
  333. def _sub_ms_char(self, match):
  334. """Changes a MS smart quote character to an XML or HTML
  335. entity, or an ASCII character."""
  336. orig = match.group(1)
  337. if self.smart_quotes_to == 'ascii':
  338. sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
  339. else:
  340. sub = self.MS_CHARS.get(orig)
  341. if type(sub) == tuple:
  342. if self.smart_quotes_to == 'xml':
  343. sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
  344. else:
  345. sub = '&'.encode() + sub[0].encode() + ';'.encode()
  346. else:
  347. sub = sub.encode()
  348. return sub
  349. def _convert_from(self, proposed, errors="strict"):
  350. proposed = self.find_codec(proposed)
  351. if not proposed or (proposed, errors) in self.tried_encodings:
  352. return None
  353. self.tried_encodings.append((proposed, errors))
  354. markup = self.markup
  355. # Convert smart quotes to HTML if coming from an encoding
  356. # that might have them.
  357. if (self.smart_quotes_to is not None
  358. and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
  359. smart_quotes_re = b"([\x80-\x9f])"
  360. smart_quotes_compiled = re.compile(smart_quotes_re)
  361. markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
  362. try:
  363. #print "Trying to convert document to %s (errors=%s)" % (
  364. # proposed, errors)
  365. u = self._to_unicode(markup, proposed, errors)
  366. self.markup = u
  367. self.original_encoding = proposed
  368. except Exception as e:
  369. #print "That didn't work!"
  370. #print e
  371. return None
  372. #print "Correct encoding: %s" % proposed
  373. return self.markup
  374. def _to_unicode(self, data, encoding, errors="strict"):
  375. '''Given a string and its encoding, decodes the string into Unicode.
  376. %encoding is a string recognized by encodings.aliases'''
  377. return str(data, encoding, errors)
  378. @property
  379. def declared_html_encoding(self):
  380. if not self.is_html:
  381. return None
  382. return self.detector.declared_encoding
  383. def find_codec(self, charset):
  384. value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
  385. or (charset and self._codec(charset.replace("-", "")))
  386. or (charset and self._codec(charset.replace("-", "_")))
  387. or (charset and charset.lower())
  388. or charset
  389. )
  390. if value:
  391. return value.lower()
  392. return None
  393. def _codec(self, charset):
  394. if not charset:
  395. return charset
  396. codec = None
  397. try:
  398. codecs.lookup(charset)
  399. codec = charset
  400. except (LookupError, ValueError):
  401. pass
  402. return codec
  403. # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
  404. MS_CHARS = {b'\x80': ('euro', '20AC'),
  405. b'\x81': ' ',
  406. b'\x82': ('sbquo', '201A'),
  407. b'\x83': ('fnof', '192'),
  408. b'\x84': ('bdquo', '201E'),
  409. b'\x85': ('hellip', '2026'),
  410. b'\x86': ('dagger', '2020'),
  411. b'\x87': ('Dagger', '2021'),
  412. b'\x88': ('circ', '2C6'),
  413. b'\x89': ('permil', '2030'),
  414. b'\x8A': ('Scaron', '160'),
  415. b'\x8B': ('lsaquo', '2039'),
  416. b'\x8C': ('OElig', '152'),
  417. b'\x8D': '?',
  418. b'\x8E': ('#x17D', '17D'),
  419. b'\x8F': '?',
  420. b'\x90': '?',
  421. b'\x91': ('lsquo', '2018'),
  422. b'\x92': ('rsquo', '2019'),
  423. b'\x93': ('ldquo', '201C'),
  424. b'\x94': ('rdquo', '201D'),
  425. b'\x95': ('bull', '2022'),
  426. b'\x96': ('ndash', '2013'),
  427. b'\x97': ('mdash', '2014'),
  428. b'\x98': ('tilde', '2DC'),
  429. b'\x99': ('trade', '2122'),
  430. b'\x9a': ('scaron', '161'),
  431. b'\x9b': ('rsaquo', '203A'),
  432. b'\x9c': ('oelig', '153'),
  433. b'\x9d': '?',
  434. b'\x9e': ('#x17E', '17E'),
  435. b'\x9f': ('Yuml', ''),}
  436. # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
  437. # horrors like stripping diacritical marks to turn á into a, but also
  438. # contains non-horrors like turning “ into ".
  439. MS_CHARS_TO_ASCII = {
  440. b'\x80' : 'EUR',
  441. b'\x81' : ' ',
  442. b'\x82' : ',',
  443. b'\x83' : 'f',
  444. b'\x84' : ',,',
  445. b'\x85' : '...',
  446. b'\x86' : '+',
  447. b'\x87' : '++',
  448. b'\x88' : '^',
  449. b'\x89' : '%',
  450. b'\x8a' : 'S',
  451. b'\x8b' : '<',
  452. b'\x8c' : 'OE',
  453. b'\x8d' : '?',
  454. b'\x8e' : 'Z',
  455. b'\x8f' : '?',
  456. b'\x90' : '?',
  457. b'\x91' : "'",
  458. b'\x92' : "'",
  459. b'\x93' : '"',
  460. b'\x94' : '"',
  461. b'\x95' : '*',
  462. b'\x96' : '-',
  463. b'\x97' : '--',
  464. b'\x98' : '~',
  465. b'\x99' : '(TM)',
  466. b'\x9a' : 's',
  467. b'\x9b' : '>',
  468. b'\x9c' : 'oe',
  469. b'\x9d' : '?',
  470. b'\x9e' : 'z',
  471. b'\x9f' : 'Y',
  472. b'\xa0' : ' ',
  473. b'\xa1' : '!',
  474. b'\xa2' : 'c',
  475. b'\xa3' : 'GBP',
  476. b'\xa4' : '$', #This approximation is especially parochial--this is the
  477. #generic currency symbol.
  478. b'\xa5' : 'YEN',
  479. b'\xa6' : '|',
  480. b'\xa7' : 'S',
  481. b'\xa8' : '..',
  482. b'\xa9' : '',
  483. b'\xaa' : '(th)',
  484. b'\xab' : '<<',
  485. b'\xac' : '!',
  486. b'\xad' : ' ',
  487. b'\xae' : '(R)',
  488. b'\xaf' : '-',
  489. b'\xb0' : 'o',
  490. b'\xb1' : '+-',
  491. b'\xb2' : '2',
  492. b'\xb3' : '3',
  493. b'\xb4' : ("'", 'acute'),
  494. b'\xb5' : 'u',
  495. b'\xb6' : 'P',
  496. b'\xb7' : '*',
  497. b'\xb8' : ',',
  498. b'\xb9' : '1',
  499. b'\xba' : '(th)',
  500. b'\xbb' : '>>',
  501. b'\xbc' : '1/4',
  502. b'\xbd' : '1/2',
  503. b'\xbe' : '3/4',
  504. b'\xbf' : '?',
  505. b'\xc0' : 'A',
  506. b'\xc1' : 'A',
  507. b'\xc2' : 'A',
  508. b'\xc3' : 'A',
  509. b'\xc4' : 'A',
  510. b'\xc5' : 'A',
  511. b'\xc6' : 'AE',
  512. b'\xc7' : 'C',
  513. b'\xc8' : 'E',
  514. b'\xc9' : 'E',
  515. b'\xca' : 'E',
  516. b'\xcb' : 'E',
  517. b'\xcc' : 'I',
  518. b'\xcd' : 'I',
  519. b'\xce' : 'I',
  520. b'\xcf' : 'I',
  521. b'\xd0' : 'D',
  522. b'\xd1' : 'N',
  523. b'\xd2' : 'O',
  524. b'\xd3' : 'O',
  525. b'\xd4' : 'O',
  526. b'\xd5' : 'O',
  527. b'\xd6' : 'O',
  528. b'\xd7' : '*',
  529. b'\xd8' : 'O',
  530. b'\xd9' : 'U',
  531. b'\xda' : 'U',
  532. b'\xdb' : 'U',
  533. b'\xdc' : 'U',
  534. b'\xdd' : 'Y',
  535. b'\xde' : 'b',
  536. b'\xdf' : 'B',
  537. b'\xe0' : 'a',
  538. b'\xe1' : 'a',
  539. b'\xe2' : 'a',
  540. b'\xe3' : 'a',
  541. b'\xe4' : 'a',
  542. b'\xe5' : 'a',
  543. b'\xe6' : 'ae',
  544. b'\xe7' : 'c',
  545. b'\xe8' : 'e',
  546. b'\xe9' : 'e',
  547. b'\xea' : 'e',
  548. b'\xeb' : 'e',
  549. b'\xec' : 'i',
  550. b'\xed' : 'i',
  551. b'\xee' : 'i',
  552. b'\xef' : 'i',
  553. b'\xf0' : 'o',
  554. b'\xf1' : 'n',
  555. b'\xf2' : 'o',
  556. b'\xf3' : 'o',
  557. b'\xf4' : 'o',
  558. b'\xf5' : 'o',
  559. b'\xf6' : 'o',
  560. b'\xf7' : '/',
  561. b'\xf8' : 'o',
  562. b'\xf9' : 'u',
  563. b'\xfa' : 'u',
  564. b'\xfb' : 'u',
  565. b'\xfc' : 'u',
  566. b'\xfd' : 'y',
  567. b'\xfe' : 'b',
  568. b'\xff' : 'y',
  569. }
  570. # A map used when removing rogue Windows-1252/ISO-8859-1
  571. # characters in otherwise UTF-8 documents.
  572. #
  573. # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
  574. # Windows-1252.
  575. WINDOWS_1252_TO_UTF8 = {
  576. 0x80 : b'\xe2\x82\xac', # €
  577. 0x82 : b'\xe2\x80\x9a', # ‚
  578. 0x83 : b'\xc6\x92', # ƒ
  579. 0x84 : b'\xe2\x80\x9e', # „
  580. 0x85 : b'\xe2\x80\xa6', # …
  581. 0x86 : b'\xe2\x80\xa0', # †
  582. 0x87 : b'\xe2\x80\xa1', # ‡
  583. 0x88 : b'\xcb\x86', # ˆ
  584. 0x89 : b'\xe2\x80\xb0', # ‰
  585. 0x8a : b'\xc5\xa0', # Š
  586. 0x8b : b'\xe2\x80\xb9', # ‹
  587. 0x8c : b'\xc5\x92', # Œ
  588. 0x8e : b'\xc5\xbd', # Ž
  589. 0x91 : b'\xe2\x80\x98', # ‘
  590. 0x92 : b'\xe2\x80\x99', # ’
  591. 0x93 : b'\xe2\x80\x9c', # “
  592. 0x94 : b'\xe2\x80\x9d', # ”
  593. 0x95 : b'\xe2\x80\xa2', # •
  594. 0x96 : b'\xe2\x80\x93', # –
  595. 0x97 : b'\xe2\x80\x94', # —
  596. 0x98 : b'\xcb\x9c', # ˜
  597. 0x99 : b'\xe2\x84\xa2', # ™
  598. 0x9a : b'\xc5\xa1', # š
  599. 0x9b : b'\xe2\x80\xba', # ›
  600. 0x9c : b'\xc5\x93', # œ
  601. 0x9e : b'\xc5\xbe', # ž
  602. 0x9f : b'\xc5\xb8', # Ÿ
  603. 0xa0 : b'\xc2\xa0', #  
  604. 0xa1 : b'\xc2\xa1', # ¡
  605. 0xa2 : b'\xc2\xa2', # ¢
  606. 0xa3 : b'\xc2\xa3', # £
  607. 0xa4 : b'\xc2\xa4', # ¤
  608. 0xa5 : b'\xc2\xa5', # ¥
  609. 0xa6 : b'\xc2\xa6', # ¦
  610. 0xa7 : b'\xc2\xa7', # §
  611. 0xa8 : b'\xc2\xa8', # ¨
  612. 0xa9 : b'\xc2\xa9', # ©
  613. 0xaa : b'\xc2\xaa', # ª
  614. 0xab : b'\xc2\xab', # «
  615. 0xac : b'\xc2\xac', # ¬
  616. 0xad : b'\xc2\xad', # ­
  617. 0xae : b'\xc2\xae', # ®
  618. 0xaf : b'\xc2\xaf', # ¯
  619. 0xb0 : b'\xc2\xb0', # °
  620. 0xb1 : b'\xc2\xb1', # ±
  621. 0xb2 : b'\xc2\xb2', # ²
  622. 0xb3 : b'\xc2\xb3', # ³
  623. 0xb4 : b'\xc2\xb4', # ´
  624. 0xb5 : b'\xc2\xb5', # µ
  625. 0xb6 : b'\xc2\xb6', # ¶
  626. 0xb7 : b'\xc2\xb7', # ·
  627. 0xb8 : b'\xc2\xb8', # ¸
  628. 0xb9 : b'\xc2\xb9', # ¹
  629. 0xba : b'\xc2\xba', # º
  630. 0xbb : b'\xc2\xbb', # »
  631. 0xbc : b'\xc2\xbc', # ¼
  632. 0xbd : b'\xc2\xbd', # ½
  633. 0xbe : b'\xc2\xbe', # ¾
  634. 0xbf : b'\xc2\xbf', # ¿
  635. 0xc0 : b'\xc3\x80', # À
  636. 0xc1 : b'\xc3\x81', # Á
  637. 0xc2 : b'\xc3\x82', # Â
  638. 0xc3 : b'\xc3\x83', # Ã
  639. 0xc4 : b'\xc3\x84', # Ä
  640. 0xc5 : b'\xc3\x85', # Å
  641. 0xc6 : b'\xc3\x86', # Æ
  642. 0xc7 : b'\xc3\x87', # Ç
  643. 0xc8 : b'\xc3\x88', # È
  644. 0xc9 : b'\xc3\x89', # É
  645. 0xca : b'\xc3\x8a', # Ê
  646. 0xcb : b'\xc3\x8b', # Ë
  647. 0xcc : b'\xc3\x8c', # Ì
  648. 0xcd : b'\xc3\x8d', # Í
  649. 0xce : b'\xc3\x8e', # Î
  650. 0xcf : b'\xc3\x8f', # Ï
  651. 0xd0 : b'\xc3\x90', # Ð
  652. 0xd1 : b'\xc3\x91', # Ñ
  653. 0xd2 : b'\xc3\x92', # Ò
  654. 0xd3 : b'\xc3\x93', # Ó
  655. 0xd4 : b'\xc3\x94', # Ô
  656. 0xd5 : b'\xc3\x95', # Õ
  657. 0xd6 : b'\xc3\x96', # Ö
  658. 0xd7 : b'\xc3\x97', # ×
  659. 0xd8 : b'\xc3\x98', # Ø
  660. 0xd9 : b'\xc3\x99', # Ù
  661. 0xda : b'\xc3\x9a', # Ú
  662. 0xdb : b'\xc3\x9b', # Û
  663. 0xdc : b'\xc3\x9c', # Ü
  664. 0xdd : b'\xc3\x9d', # Ý
  665. 0xde : b'\xc3\x9e', # Þ
  666. 0xdf : b'\xc3\x9f', # ß
  667. 0xe0 : b'\xc3\xa0', # à
  668. 0xe1 : b'\xa1', # á
  669. 0xe2 : b'\xc3\xa2', # â
  670. 0xe3 : b'\xc3\xa3', # ã
  671. 0xe4 : b'\xc3\xa4', # ä
  672. 0xe5 : b'\xc3\xa5', # å
  673. 0xe6 : b'\xc3\xa6', # æ
  674. 0xe7 : b'\xc3\xa7', # ç
  675. 0xe8 : b'\xc3\xa8', # è
  676. 0xe9 : b'\xc3\xa9', # é
  677. 0xea : b'\xc3\xaa', # ê
  678. 0xeb : b'\xc3\xab', # ë
  679. 0xec : b'\xc3\xac', # ì
  680. 0xed : b'\xc3\xad', # í
  681. 0xee : b'\xc3\xae', # î
  682. 0xef : b'\xc3\xaf', # ï
  683. 0xf0 : b'\xc3\xb0', # ð
  684. 0xf1 : b'\xc3\xb1', # ñ
  685. 0xf2 : b'\xc3\xb2', # ò
  686. 0xf3 : b'\xc3\xb3', # ó
  687. 0xf4 : b'\xc3\xb4', # ô
  688. 0xf5 : b'\xc3\xb5', # õ
  689. 0xf6 : b'\xc3\xb6', # ö
  690. 0xf7 : b'\xc3\xb7', # ÷
  691. 0xf8 : b'\xc3\xb8', # ø
  692. 0xf9 : b'\xc3\xb9', # ù
  693. 0xfa : b'\xc3\xba', # ú
  694. 0xfb : b'\xc3\xbb', # û
  695. 0xfc : b'\xc3\xbc', # ü
  696. 0xfd : b'\xc3\xbd', # ý
  697. 0xfe : b'\xc3\xbe', # þ
  698. }
  699. MULTIBYTE_MARKERS_AND_SIZES = [
  700. (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
  701. (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
  702. (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
  703. ]
  704. FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
  705. LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
  706. @classmethod
  707. def detwingle(cls, in_bytes, main_encoding="utf8",
  708. embedded_encoding="windows-1252"):
  709. """Fix characters from one encoding embedded in some other encoding.
  710. Currently the only situation supported is Windows-1252 (or its
  711. subset ISO-8859-1), embedded in UTF-8.
  712. The input must be a bytestring. If you've already converted
  713. the document to Unicode, you're too late.
  714. The output is a bytestring in which `embedded_encoding`
  715. characters have been converted to their `main_encoding`
  716. equivalents.
  717. """
  718. if embedded_encoding.replace('_', '-').lower() not in (
  719. 'windows-1252', 'windows_1252'):
  720. raise NotImplementedError(
  721. "Windows-1252 and ISO-8859-1 are the only currently supported "
  722. "embedded encodings.")
  723. if main_encoding.lower() not in ('utf8', 'utf-8'):
  724. raise NotImplementedError(
  725. "UTF-8 is the only currently supported main encoding.")
  726. byte_chunks = []
  727. chunk_start = 0
  728. pos = 0
  729. while pos < len(in_bytes):
  730. byte = in_bytes[pos]
  731. if not isinstance(byte, int):
  732. # Python 2.x
  733. byte = ord(byte)
  734. if (byte >= cls.FIRST_MULTIBYTE_MARKER
  735. and byte <= cls.LAST_MULTIBYTE_MARKER):
  736. # This is the start of a UTF-8 multibyte character. Skip
  737. # to the end.
  738. for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
  739. if byte >= start and byte <= end:
  740. pos += size
  741. break
  742. elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
  743. # We found a Windows-1252 character!
  744. # Save the string up to this point as a chunk.
  745. byte_chunks.append(in_bytes[chunk_start:pos])
  746. # Now translate the Windows-1252 character into UTF-8
  747. # and add it as another, one-byte chunk.
  748. byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
  749. pos += 1
  750. chunk_start = pos
  751. else:
  752. # Go on to the next character.
  753. pos += 1
  754. if chunk_start == 0:
  755. # The string is unchanged.
  756. return in_bytes
  757. else:
  758. # Store the final chunk.
  759. byte_chunks.append(in_bytes[chunk_start:])
  760. return b''.join(byte_chunks)