|
|
- # coding: utf-8
- """
-
- webencodings
- ~~~~~~~~~~~~
-
- This is a Python implementation of the `WHATWG Encoding standard
- <http://encoding.spec.whatwg.org/>`_. See README for details.
-
- :copyright: Copyright 2012 by Simon Sapin
- :license: BSD, see LICENSE for details.
-
- """
-
- from __future__ import unicode_literals
-
- import codecs
-
- from .labels import LABELS
-
-
# Version string of this package.
VERSION = '0.5.1'


# A handful of names used by the Encoding standard are not accepted by
# Python as codec aliases; this maps each of them to a name Python knows.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects created so far, keyed by canonical encoding name.
CACHE = {}
-
-
def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: A Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Lone surrogates are passed through unchanged rather than raising
    :exc:`UnicodeEncodeError`:

        >>> assert ascii_lower(u'A\ud800') == u'a\ud800'

    """
    # This turns out to be faster than unicode.translate().
    # 'surrogatepass' keeps lone surrogates intact through the UTF-8 round
    # trip, so an odd label cannot make this helper raise.
    encoded = string.encode('utf8', 'surrogatepass')
    return encoded.lower().decode('utf8', 'surrogatepass')
-
-
def lookup(label):
    """
    Look for an encoding by its label.
    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only ASCII whitespace is stripped:
    # U+0009, U+000A, U+000C, U+000D, and U+0020.
    canonical = LABELS.get(ascii_lower(label.strip('\t\n\f\r ')))
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    # First request for this encoding: build the object and remember it.
    if canonical == 'x-user-defined':
        from .x_user_defined import codec_info
    else:
        # Any name that gets to here should be valid for Python
        # once remapped through PYTHON_NAMES.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))
    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created
-
-
def _get_encoding(encoding_or_label):
    """
    Normalize an encoding object or a label into an encoding object.

    Anything that has a ``codec_info`` attribute is treated as an
    encoding object and returned as is; everything else is passed
    to :func:`lookup` as a label.

    :param encoding_or_label: An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if not hasattr(encoding_or_label, 'codec_info'):
        found = lookup(encoding_or_label)
        if found is None:
            raise LookupError(
                'Unknown encoding label: %r' % encoding_or_label)
        return found
    return encoding_or_label
-
-
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """

    def __init__(self, name, codec_info):
        # Plain attribute storage; no validation is done here.
        self.name, self.codec_info = name, codec_info

    def __repr__(self):
        template = '<Encoding %s>'
        return template % self.name
-
-
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# Private handles on the two UTF-16 byte orders, looked up once at import.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')
-
-
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    A BOM at the start of the input, if any, selects the encoding
    and is removed before decoding.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of a Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before looking at the input, so that an
    # invalid label fails early even when a BOM is present.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    chosen = detected or fallback
    text = chosen.codec_info.decode(payload, errors)[0]
    return text, chosen
-
-
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None`, and the input is returned untouched,
    when the input does not start with one of the known BOMs.

    """
    # Checked in this order; the first matching prefix wins.
    known_boms = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for bom, bom_encoding in known_boms:
        if input.startswith(bom):
            return bom_encoding, input[len(bom):]
    return None, input
-
-
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: A Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    resolved = _get_encoding(encoding)
    result = resolved.codec_info.encode(input, errors)
    # Only the first item of the codec's result is handed back.
    return result[0]
-
-
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    output = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The first item produced by the generator is the encoding itself;
    # pull it out here so that only decoded text is left in ``output``.
    encoding = next(output)
    return output, encoding
-
-
- def _iter_decode_generator(input, decoder):
- """Return a generator that first yields the :obj:`Encoding`,
- then yields output chukns as Unicode strings.
-
- """
- decode = decoder.decode
- input = iter(input)
- for chunck in input:
- output = decode(chunck)
- if output:
- assert decoder.encoding is not None
- yield decoder.encoding
- yield output
- break
- else:
- # Input exhausted without determining the encoding
- output = decode(b'', final=True)
- assert decoder.encoding is not None
- yield decoder.encoding
- if output:
- yield output
- return
-
- for chunck in input:
- output = decode(chunck)
- if output:
- yield output
- output = decode(b'', final=True)
- if output:
- yield output
-
-
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # The encoder is built here, outside of the generator, so that an
    # invalid label fails early rather than on first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
-
-
- def _iter_encode_generator(input, encode):
- for chunck in input:
- output = encode(chunck)
- if output:
- yield output
- output = encode('', final=True)
- if output:
- yield output
-
-
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """

    def __init__(self, fallback_encoding, errors='replace'):
        # Resolving the label here makes an invalid one fail early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still undecided whether a BOM follows.
        self._buffer = b''
        # Bound ``decode`` of the underlying incremental decoder,
        # set once the encoding has been chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A Unicode string.

        """
        if self._decoder is not None:
            # Encoding already chosen: just delegate.
            return self._decoder(input, final)

        data = self._buffer + input
        encoding, data = _detect_bom(data)
        if encoding is None:
            if not final and len(data) < 3:
                # Not enough data yet to rule out a BOM; keep it for later.
                self._buffer = data
                return ''
            # No BOM
            encoding = self._fallback_encoding

        incremental = encoding.codec_info.incrementaldecoder(self._errors)
        self._decoder = incremental.decode
        self.encoding = encoding
        return self._decoder(data, final)
-
-
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: A Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """

    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        # ``encode`` is exposed as an instance attribute bound directly
        # to the underlying incremental encoder's method.
        incremental = codec_info.incrementalencoder(errors)
        self.encode = incremental.encode
|