You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

33 lines
1.0 KiB

4 years ago
  1. import codecs
  2. import locale
  3. import re
  4. import sys
  5. BOMS = [
  6. (codecs.BOM_UTF8, 'utf8'),
  7. (codecs.BOM_UTF16, 'utf16'),
  8. (codecs.BOM_UTF16_BE, 'utf16-be'),
  9. (codecs.BOM_UTF16_LE, 'utf16-le'),
  10. (codecs.BOM_UTF32, 'utf32'),
  11. (codecs.BOM_UTF32_BE, 'utf32-be'),
  12. (codecs.BOM_UTF32_LE, 'utf32-le'),
  13. ]
  14. ENCODING_RE = re.compile(br'coding[:=]\s*([-\w.]+)')
  15. def auto_decode(data):
  16. """Check a bytes string for a BOM to correctly detect the encoding
  17. Fallback to locale.getpreferredencoding(False) like open() on Python3"""
  18. for bom, encoding in BOMS:
  19. if data.startswith(bom):
  20. return data[len(bom):].decode(encoding)
  21. # Lets check the first two lines as in PEP263
  22. for line in data.split(b'\n')[:2]:
  23. if line[0:1] == b'#' and ENCODING_RE.search(line):
  24. encoding = ENCODING_RE.search(line).groups()[0].decode('ascii')
  25. return data.decode(encoding)
  26. return data.decode(
  27. locale.getpreferredencoding(False) or sys.getdefaultencoding(),
  28. )