|
|
- # -*- coding: utf-8 -*-
- """
- Functions for handling encoding of web pages
- """
- import re, codecs, encodings
- from sys import version_info
-
- _HEADER_ENCODING_RE = re.compile(r'charset=([\w-]+)', re.I)
-
def http_content_type_encoding(content_type):
    """Return the encoding named by the charset of a content-type header

    The charset is located with ``_HEADER_ENCODING_RE`` and then passed
    through ``resolve_encoding``. ``None`` is returned when `content_type`
    is empty, when it carries no charset, or when the charset cannot be
    resolved to a known encoding.

    >>> import w3lib.encoding
    >>> w3lib.encoding.http_content_type_encoding("Content-Type: text/html; charset=ISO-8859-4")
    'iso8859-4'

    """
    if not content_type:
        return None
    found = _HEADER_ENCODING_RE.search(content_type)
    if found is None:
        return None
    return resolve_encoding(found.group(1))
-
- # regexp for parsing HTTP meta tags
- _TEMPLATE = r'''%s\s*=\s*["']?\s*%s\s*["']?'''
- _SKIP_ATTRS = '''(?x)(?:\\s+
- [^=<>/\\s"'\x00-\x1f\x7f]+ # Attribute name
- (?:\\s*=\\s*
- (?: # ' and " are entity encoded (', "), so no need for \', \"
- '[^']*' # attr in '
- |
- "[^"]*" # attr in "
- |
- [^'"\\s]+ # attr having no ' nor "
- ))?
- )*?'''
- _HTTPEQUIV_RE = _TEMPLATE % ('http-equiv', 'Content-Type')
- _CONTENT_RE = _TEMPLATE % ('content', r'(?P<mime>[^;]+);\s*charset=(?P<charset>[\w-]+)')
- _CONTENT2_RE = _TEMPLATE % ('charset', r'(?P<charset2>[\w-]+)')
- _XML_ENCODING_RE = _TEMPLATE % ('encoding', r'(?P<xmlcharset>[\w-]+)')
-
- # check for meta tags, or xml decl. and stop search if a body tag is encountered
- _BODY_ENCODING_PATTERN = r'<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)' % (
- _SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
- _BODY_ENCODING_STR_RE = re.compile(_BODY_ENCODING_PATTERN, re.I)
- _BODY_ENCODING_BYTES_RE = re.compile(_BODY_ENCODING_PATTERN.encode('ascii'), re.I)
-
def html_body_declared_encoding(html_body_str):
    '''Return the encoding declared inside the html body itself,
    or ``None`` if no suitable declaration was found

    Only the first 4096 characters (or bytes) of `html_body_str` are
    examined. Both text and bytes input are accepted; the matching compiled
    pattern is chosen by type. The first meta tag, xml declaration or body
    tag found decides the result, and the declared name is passed through
    ``resolve_encoding``.

    >>> import w3lib.encoding
    >>> w3lib.encoding.html_body_declared_encoding(
    ... """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    ...      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    ... <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
    ... <head>
    ...     <title>Some title</title>
    ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
    ... </head>
    ... <body>
    ... ...
    ... </body>
    ... </html>""")
    'utf-8'
    >>>

    '''

    # html5 suggests the first 1024 bytes are sufficient, we allow for more
    head = html_body_str[:4096]
    pattern = _BODY_ENCODING_BYTES_RE if isinstance(head, bytes) else _BODY_ENCODING_STR_RE
    found = pattern.search(head)
    if not found:
        return None

    # the first non-empty named group holds the declared encoding; a bare
    # body tag matches without filling any of them
    for group_name in ('charset', 'charset2', 'xmlcharset'):
        declared = found.group(group_name)
        if declared:
            return resolve_encoding(declared)
    return None
-
# Default encoding translation
# Maps canonicalized encoding names to the encoding that should actually be
# used to decode the content, see
# http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#character-encodings-0
# In addition, gb18030 supersedes gb2312 & gbk.
# Keys are written in the form produced by _c18n_encoding and are kept in
# roughly alphabetical order.
DEFAULT_ENCODING_TRANSLATION = {
    "ascii": "cp1252",
    "big5": "big5hkscs",
    "euc_kr": "cp949",
    "gb2312": "gb18030",
    "gb_2312_80": "gb18030",
    "gbk": "gb18030",
    "iso8859_11": "cp874",
    "iso8859_9": "cp1254",
    "latin_1": "cp1252",
    "macintosh": "mac_roman",
    "shift_jis": "cp932",
    "tis_620": "cp874",
    "win_1251": "cp1251",
    "windows_31j": "cp932",
    "win_31j": "cp932",
    "windows_874": "cp874",
    "win_874": "cp874",
    "x_sjis": "cp932",
    "zh_cn": "gb18030",
}
-
- def _c18n_encoding(encoding):
- """Cannonicalize an encoding name
-
- This performs normalization and translates aliases using python's
- encoding aliases
- """
- normed = encodings.normalize_encoding(encoding).lower()
- return encodings.aliases.aliases.get(normed, normed)
-
def resolve_encoding(encoding_alias):
    """Return the name of the codec that `encoding_alias` should be decoded
    with, or ``None`` if python knows no such encoding

    The alias is canonicalized first, then replaced by its entry in
    ``DEFAULT_ENCODING_TRANSLATION`` when there is one, and finally looked
    up in the codec registry.

    >>> import w3lib.encoding
    >>> w3lib.encoding.resolve_encoding('latin1')
    'cp1252'
    >>> w3lib.encoding.resolve_encoding('gb_2312-80')
    'gb18030'
    >>>

    """
    canonical = _c18n_encoding(encoding_alias)
    target = DEFAULT_ENCODING_TRANSLATION.get(canonical, canonical)
    try:
        codec_info = codecs.lookup(target)
    except LookupError:
        # unknown to the codec registry
        return None
    return codec_info.name
-
- _BOM_TABLE = [
- (codecs.BOM_UTF32_BE, 'utf-32-be'),
- (codecs.BOM_UTF32_LE, 'utf-32-le'),
- (codecs.BOM_UTF16_BE, 'utf-16-be'),
- (codecs.BOM_UTF16_LE, 'utf-16-le'),
- (codecs.BOM_UTF8, 'utf-8')
- ]
- _FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE)
-
def read_bom(data):
    r"""Detect a byte order mark at the start of `data`.

    Returns a tuple ``(encoding, bom)`` holding the encoding the BOM stands
    for and the BOM itself, or ``(None, None)`` when `data` is empty or does
    not start with a known BOM.

    >>> import w3lib.encoding
    >>> w3lib.encoding.read_bom(b'\xfe\xff\x6c\x34')
    ('utf-16-be', '\xfe\xff')
    >>> w3lib.encoding.read_bom(b'\xff\xfe\x34\x6c')
    ('utf-16-le', '\xff\xfe')
    >>> w3lib.encoding.read_bom(b'\x00\x00\xfe\xff\x00\x00\x6c\x34')
    ('utf-32-be', '\x00\x00\xfe\xff')
    >>> w3lib.encoding.read_bom(b'\xff\xfe\x00\x00\x34\x6c\x00\x00')
    ('utf-32-le', '\xff\xfe\x00\x00')
    >>> w3lib.encoding.read_bom(b'\x01\x02\x03\x04')
    (None, None)
    >>>

    """

    # common case is no BOM, so rule it out with a single set lookup
    if not data or data[0] not in _FIRST_CHARS:
        return None, None
    for mark, name in _BOM_TABLE:
        if data.startswith(mark):
            return name, mark
    return None, None
-
# Python decoder doesn't follow unicode standard when handling
# bad utf-8 encoded strings. see http://bugs.python.org/issue8271
def _w3lib_replace(exc):
    """Error handler: emit one replacement character and resume decoding
    at the end of the offending input reported by `exc`."""
    return (u'\ufffd', exc.end)


codecs.register_error('w3lib_replace', _w3lib_replace)
-
def to_unicode(data_str, encoding):
    """Decode `data_str` to unicode with the given `encoding`

    Input that cannot be decoded is turned into ``\\ufffd`` (the unicode
    replacement character) instead of raising. The builtin ``replace``
    handler is used on python 3.3 and later, the ``w3lib_replace`` handler
    on older versions.
    """
    if version_info[0:2] >= (3, 3):
        error_handler = 'replace'
    else:
        error_handler = 'w3lib_replace'
    return data_str.decode(encoding, error_handler)
-
def html_to_unicode(content_type_header, html_body_str,
                    default_encoding='utf8', auto_detect_fun=None):
    r'''Decode a raw html body to unicode, guessing its encoding

    The encoding is guessed in a way similar to what a web browser does.
    The following sources are consulted, in this order, and the first one
    that yields an encoding wins:

    * the http content type header
    * the BOM (byte-order mark)
    * meta or xml tag declarations in the body
    * auto-detection, if the `auto_detect_fun` keyword argument is not ``None``
    * the `default_encoding` keyword argument (which defaults to utf8)

    Encodings taken from the header or from declarations in the body are
    subject to overrides that replace some character encodings with more
    suitable alternatives; the auto-detected and default encodings are used
    as they are.

    A BOM is stripped when it is the source of the encoding, or when it
    agrees with the encoding given by the header. When the header names
    ``utf-16`` or ``utf-32`` without a byte order, the byte order is taken
    from a matching BOM, and big endian is assumed otherwise.

    `auto_detect_fun`, when given, is called with the raw body and must
    return the name of an encoding that python can process, or None. To use
    chardet, for example, you can define the function as::

        auto_detect_fun=lambda x: chardet.detect(x).get('encoding')

    or to use UnicodeDammit (shipped with the BeautifulSoup library)::

        auto_detect_fun=lambda x: UnicodeDammit(x).originalEncoding

    If the locale of the website or user language preference is known, then a
    better default encoding can be supplied.

    Pass ``None`` as `content_type_header` when the response had no such
    header.

    Characters that cannot be converted to unicode do not cause a failure;
    ``\ufffd`` (the unicode replacement character) is inserted instead.

    Returns a tuple of ``(<encoding used>, <unicode_string>)``

    Examples:

    >>> import w3lib.encoding
    >>> w3lib.encoding.html_to_unicode(None,
    ... b"""<!DOCTYPE html>
    ... <head>
    ... <meta charset="UTF-8" />
    ... <meta name="viewport" content="width=device-width" />
    ... <title>Creative Commons France</title>
    ... <link rel='canonical' href='http://creativecommons.fr/' />
    ... <body>
    ... <p>Creative Commons est une organisation \xc3\xa0 but non lucratif
    ... qui a pour dessein de faciliter la diffusion et le partage des oeuvres
    ... tout en accompagnant les nouvelles pratiques de cr\xc3\xa9ation \xc3\xa0 l\xe2\x80\x99\xc3\xa8re numerique.</p>
    ... </body>
    ... </html>""")
    ('utf-8', u'<!DOCTYPE html>\n<head>\n<meta charset="UTF-8" />\n<meta name="viewport" content="width=device-width" />\n<title>Creative Commons France</title>\n<link rel=\'canonical\' href=\'http://creativecommons.fr/\' />\n<body>\n<p>Creative Commons est une organisation \xe0 but non lucratif\nqui a pour dessein de faciliter la diffusion et le partage des oeuvres\ntout en accompagnant les nouvelles pratiques de cr\xe9ation \xe0 l\u2019\xe8re numerique.</p>\n</body>\n</html>')
    >>>

    '''

    header_enc = http_content_type_encoding(content_type_header)
    bom_enc, bom = read_bom(html_body_str)

    # 1. the http header takes precedence over everything else
    if header_enc is not None:
        body = html_body_str
        if header_enc == bom_enc:
            # remove BOM if it agrees with the encoding
            body = body[len(bom):]
        elif header_enc in ('utf-16', 'utf-32'):
            # read endianness from BOM, or default to big endian
            # tools.ietf.org/html/rfc2781 section 4.3
            bom_gives_order = bom_enc is not None and bom_enc.startswith(header_enc)
            if bom_gives_order:
                header_enc, body = bom_enc, body[len(bom):]
            else:
                header_enc = header_enc + '-be'
        return header_enc, to_unicode(body, header_enc)

    # 2. no usable header: trust the BOM and strip it
    if bom_enc is not None:
        return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc)

    # 3. declaration in the body, then auto-detection, then the default
    chosen = html_body_declared_encoding(html_body_str)
    if chosen is None and auto_detect_fun is not None:
        chosen = auto_detect_fun(html_body_str)
    if chosen is None:
        chosen = default_encoding
    return chosen, to_unicode(html_body_str, chosen)
|