alpcentaur
/
brieftaube

# coding: utf-8
"""

    webencodings
    ~~~~~~~~~~~~

    This is a Python implementation of the `WHATWG Encoding standard
    <http://encoding.spec.whatwg.org/>`_. See README for details.

    :copyright: Copyright 2012 by Simon Sapin
    :license: BSD, see LICENSE for details.

"""

from __future__ import unicode_literals
import codecs
from .labels import LABELS

VERSION = '0.5.1'

# Some names in Encoding are not valid Python aliases. Remap these.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Canonical encoding name -> Encoding object, filled lazily by lookup().
CACHE = {}

def ascii_lower(string):
    r"""Transform (only) ASCII letters to lower case: A-Z is mapped to a-z.

    :param string: An Unicode string.
    :returns: A new Unicode string.

    This is used for `ASCII case-insensitive
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_
    matching of encoding labels.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    This is different from the :meth:`~py:str.lower` method of Unicode strings
    which also affect non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    Strings that cannot be encoded as UTF-8 (lone surrogates on Python 3)
    are still handled: only their ASCII letters are lowered and every other
    code point is returned unchanged.

    """
    try:
        # This turns out to be faster than unicode.translate()
        return string.encode('utf8').lower().decode('utf8')
    except UnicodeEncodeError:
        # The UTF-8 round trip rejects lone surrogates. Fall back to an
        # explicit A-Z -> a-z table so callers such as lookup() get a
        # result instead of an unrelated encoding error.
        table = dict((code, code + 0x20) for code in range(0x41, 0x5B))
        return string.translate(table)

def lookup(label):
    """
    Find an encoding from one of its labels.

    This implements the `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm
    of the Encoding standard. Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    normalized = ascii_lower(label.strip('\t\n\f\r '))
    name = LABELS.get(normalized)
    if name is None:
        return None

    cached = CACHE.get(name)
    if cached is not None:
        return cached

    if name == 'x-user-defined':
        # This codec ships with the package rather than the stdlib.
        from .x_user_defined import codec_info
    else:
        # Any python_name value that gets to here should be valid.
        codec_info = codecs.lookup(PYTHON_NAMES.get(name, name))

    encoding = Encoding(name, codec_info)
    CACHE[name] = encoding
    return encoding

def _get_encoding(encoding_or_label):
    """
    Normalize an encoding object or a label into an encoding object.

    Anything that exposes a ``codec_info`` attribute is treated as an
    encoding object and returned unchanged; any other value is resolved
    with :func:`lookup`.

    :param encoding_or_label: An :class:`Encoding` object or a label string.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if not hasattr(encoding_or_label, 'codec_info'):
        found = lookup(encoding_or_label)
        if found is None:
            raise LookupError('Unknown encoding label: %r' % encoding_or_label)
        return found
    return encoding_or_label

class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """

    def __init__(self, name, codec_info):
        # Plain value holder: both arguments are stored as given.
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        description = '<Encoding %s>' % self.name
        return description

#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# UTF-16 encodings that _detect_bom() returns when the input starts with
# the matching byte order mark.
_UTF16LE = lookup('utf-16le')
_UTF16BE = lookup('utf-16be')

def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single string.

    A byte order mark at the start of the input, if any, takes precedence
    over ``fallback_encoding`` and is removed before decoding.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of an Unicode string
        and an :obj:`Encoding`.

    """
    # Fail early if `encoding` is an invalid label.
    fallback = _get_encoding(fallback_encoding)
    detected, payload = _detect_bom(input)
    if detected:
        encoding = detected
    else:
        encoding = fallback
    text = encoding.codec_info.decode(payload, errors)[0]
    return text, encoding

def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    ``bom_encoding`` is :obj:`None`, and the input is returned untouched,
    when the input does not start with a UTF-16 or UTF-8 byte order mark.
    """
    known_boms = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for bom, encoding in known_boms:
        if input.startswith(bom):
            return encoding, input[len(bom):]
    return None, input

def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single string.

    :param input: An Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec = _get_encoding(encoding).codec_info
    # The codec returns an (output, length consumed) pair; only the
    # output is wanted here.
    encoded = codec.encode(input, errors)
    return encoded[0]

def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The first item produced by the generator is the Encoding in use;
    # everything after it is decoded text.
    detected = next(chunks)
    return chunks, detected

def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(chunk, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    feed = decoder.decode
    chunks = iter(input)

    # Phase 1: push chunks until the decoder produces some text; by then
    # it must have settled on an encoding.
    first_text = ''
    for chunk in chunks:
        first_text = feed(chunk)
        if first_text:
            break
    exhausted = not first_text
    if exhausted:
        # Input exhausted without determining the encoding
        first_text = feed(b'', final=True)
    assert decoder.encoding is not None
    yield decoder.encoding
    if first_text:
        yield first_text
    if exhausted:
        return

    # Phase 2: stream the remaining chunks, then flush the decoder.
    for chunk in chunks:
        text = feed(chunk)
        if text:
            yield text
    tail = feed(b'', final=True)
    if tail:
        yield tail

def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Fail early if `encoding` is an invalid label.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)

def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings produced by encoding each chunk.

    :param input: An iterable of Unicode strings.
    :param encode:
        A callable taking ``(chunk, final=False)`` and returning a byte
        string. It is called one last time with an empty string and
        ``final=True`` after the input is exhausted.

    """
    encoded_chunks = (encode(piece) for piece in input)
    for data in encoded_chunks:
        if data:
            yield data
    tail = encode('', final=True)
    if tail:
        yield tail

class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """

    def __init__(self, fallback_encoding, errors='replace'):
        # Fail early if `encoding` is an invalid label.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes held back while it is still unknown whether they start a BOM.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: An Unicode string.

        """
        if self._decoder is not None:
            # Encoding already settled: hand the chunk straight through.
            return self._decoder(input, final)

        data = self._buffer + input
        encoding, data = _detect_bom(data)
        if encoding is None:
            if not final and len(data) < 3:
                # Not enough data yet.
                self._buffer = data
                return ''
            # No BOM
            encoding = self._fallback_encoding

        decode_chunk = encoding.codec_info.incrementaldecoder(
            self._errors).decode
        self._decoder = decode_chunk
        self.encoding = encoding
        return decode_chunk(data, final)

class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: An Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """

    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        incremental = codec_info.incrementalencoder(errors)
        # Expose the codec's bound method directly as the public ``encode``.
        self.encode = incremental.encode