|
|
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Compatibility
- #
- # Copyright (C) 2001-2019 NLTK Project
- #
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
-
- from __future__ import absolute_import, print_function
- import os
- import sys
- from functools import update_wrapper, wraps
- import fractions
- import unicodedata
-
- from six import string_types, text_type
-
- # Python 2/3 compatibility layer. Based on six.
-
PY3 = sys.version_info[0] == 3

if PY3:

    def get_im_class(meth):
        """Return the class of the object that the bound method ``meth`` is bound to."""
        return meth.__self__.__class__

    import io

    StringIO = io.StringIO
    BytesIO = io.BytesIO

    from datetime import timezone

    UTC = timezone.utc

    from tempfile import TemporaryDirectory

else:

    def get_im_class(meth):
        """Return the class recorded in the ``im_class`` attribute of ``meth``."""
        return meth.im_class

    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    # The same in-memory buffer class is exposed under both names here.
    BytesIO = StringIO

    from datetime import tzinfo, timedelta

    ZERO = timedelta(0)
    HOUR = timedelta(hours=1)

    # A UTC class for python 2.7
    class UTC(tzinfo):
        """UTC"""

        def utcoffset(self, dt):
            return ZERO

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return ZERO

    # Rebind the name to a single instance so that ``UTC`` is a tzinfo
    # object in both branches of this if/else.
    UTC = UTC()

    import csv
    import codecs
    import cStringIO

    class UnicodeWriter:
        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        see https://docs.python.org/2/library/csv.html

        :param f: stream object whose ``write`` method receives the encoded rows
        :param dialect: CSV dialect passed on to ``csv.writer``
        :param encoding: name of the target encoding of ``f``
        :param errors: error handling scheme given to the incremental encoder
        :param kwds: extra keyword arguments passed on to ``csv.writer``
        """

        def __init__(
            self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
        ):
            # Redirect output to a queue
            self.queue = cStringIO.StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            encoder_cls = codecs.getincrementalencoder(encoding)
            self.encoder = encoder_cls(errors=errors)

        def encode(self, data):
            """Return unicode text as UTF-8 bytes; return any other value unchanged."""
            # Only text needs encoding.  Calling ``.encode`` on a byte string
            # would first decode it implicitly as ASCII and fail on any
            # non-ASCII byte, so byte strings are passed through untouched.
            if isinstance(data, text_type):
                return data.encode("utf-8")
            else:
                return data

        def writerow(self, row):
            """Write one row to the target stream in the target encoding."""
            self.writer.writerow([self.encode(s) for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and reencode it into the target encoding.  The error policy
            # was fixed when the encoder was constructed; the second positional
            # argument of ``encode`` is ``final``, so nothing else is passed.
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

    import warnings as _warnings
    import os as _os
    from tempfile import mkdtemp

    class TemporaryDirectory(object):
        """Create and return a temporary directory.  This has the same
        behavior as mkdtemp but can be used as a context manager.  For
        example:

            with TemporaryDirectory() as tmpdir:
                ...

        Upon exiting the context, the directory and everything contained
        in it are removed.

        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
        """

        def __init__(self, suffix="", prefix="tmp", dir=None):
            self._closed = False
            self.name = None  # Handle mkdtemp raising an exception
            self.name = mkdtemp(suffix, prefix, dir)

        def __repr__(self):
            return "<{} {!r}>".format(self.__class__.__name__, self.name)

        def __enter__(self):
            return self.name

        def cleanup(self, _warn=False):
            """Remove the directory tree unless it has already been removed.

            :param _warn: when true, emit a ``Warning`` after a successful removal
            """
            if self.name and not self._closed:
                try:
                    self._rmtree(self.name)
                except (TypeError, AttributeError) as ex:
                    # Issue #10188: Emit a warning on stderr
                    # if the directory could not be cleaned
                    # up due to missing globals
                    if "None" not in str(ex):
                        raise
                    print(
                        "ERROR: {!r} while cleaning up {!r}".format(ex, self),
                        file=sys.stderr,
                    )
                    return
                self._closed = True
                if _warn:
                    self._warn("Implicitly cleaning up {!r}".format(self), Warning)

        def __exit__(self, exc, value, tb):
            self.cleanup()

        def __del__(self):
            # Issue a Warning if implicit cleanup needed
            self.cleanup(_warn=True)

        # XXX (ncoghlan): The following code attempts to make
        # this class tolerant of the module nulling out process
        # that happens during CPython interpreter shutdown
        # Alas, it doesn't actually manage it. See issue #10188
        _listdir = staticmethod(_os.listdir)
        _path_join = staticmethod(_os.path.join)
        _isdir = staticmethod(_os.path.isdir)
        _islink = staticmethod(_os.path.islink)
        _remove = staticmethod(_os.remove)
        _rmdir = staticmethod(_os.rmdir)
        _warn = _warnings.warn

        def _rmtree(self, path):
            # Essentially a stripped down version of shutil.rmtree.  We can't
            # use globals because they may be None'ed out at shutdown.
            for name in self._listdir(path):
                fullname = self._path_join(path, name)
                try:
                    isdir = self._isdir(fullname) and not self._islink(fullname)
                except OSError:
                    isdir = False
                if isdir:
                    self._rmtree(fullname)
                else:
                    # Best effort: a file that cannot be removed is skipped.
                    try:
                        self._remove(fullname)
                    except OSError:
                        pass
            # Best effort: a directory that cannot be removed is left behind.
            try:
                self._rmdir(path)
            except OSError:
                pass
-
-
- # ======= Compatibility for datasets that care about Python versions ========
-
# Each (category, package) pair below names a dataset that ships a /PY3
# subdirectory holding a full copy of the data, re-encoded or repickled.
DATA_UPDATES = [
    ("chunkers", "maxent_ne_chunker"),
    ("help", "tagsets"),
    ("taggers", "maxent_treebank_pos_tagger"),
    ("tokenizers", "punkt"),
]

# The same entries joined into path fragments with the platform separator.
_PY3_DATA_UPDATES = [os.path.join(*components) for components in DATA_UPDATES]
-
-
def add_py3_data(path):
    """
    Insert ``/PY3`` into ``path`` after the first known dataset fragment.

    Under Python 3, the first entry of ``_PY3_DATA_UPDATES`` found in
    ``str(path)`` is located in ``path``; ``/PY3`` is inserted right after
    it, or after a ``.zip`` suffix that directly follows it.  The path is
    returned unchanged when not running Python 3, when no entry matches,
    or when ``/PY3`` is already present.
    """
    if not PY3:
        return path

    for fragment in _PY3_DATA_UPDATES:
        as_text = str(path)
        if fragment not in as_text or "/PY3" in as_text:
            continue
        cut = path.index(fragment) + len(fragment)
        # Keep a ".zip" suffix attached to the archive name.
        if path[cut : cut + 4] == ".zip":
            cut += 4
        return path[:cut] + "/PY3" + path[cut:]

    return path
-
-
# Decorator that adds /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """
    Wrap ``init_func`` so that its second positional argument is passed
    through ``add_py3_data`` before the call; every other positional and
    keyword argument is forwarded unchanged.
    """

    @wraps(init_func)
    def _decorator(*args, **kwargs):
        receiver, filename = args[0], args[1]
        return init_func(receiver, add_py3_data(filename), *args[2:], **kwargs)

    return _decorator
-
-
- # ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
    """
    Return ``text`` with every character of Unicode category ``Mn``
    dropped after NFKD normalization.

    A ``bytes`` argument is first decoded as ASCII.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    decomposed = unicodedata.normalize('NFKD', text)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)
-
-
# Select the best transliteration method, trying the candidates in order
# of preference and stopping at the first one that can be imported.
transliterate = None

try:
    # Older versions of Unidecode are licensed under Artistic License;
    # assume an older version is installed.
    from unidecode import unidecode as transliterate
except ImportError:
    pass

if transliterate is None:
    try:
        # text-unidecode implementation is worse than Unidecode
        # implementation so Unidecode is preferred.
        from text_unidecode import unidecode as transliterate
    except ImportError:
        pass

if transliterate is None:
    # This transliteration method should be enough
    # for many Western languages.
    transliterate = remove_accents
-
-
def python_2_unicode_compatible(klass):
    """
    Class decorator that defines ``__unicode__`` and, under Python 2,
    replaces ``__str__`` and ``__repr__`` with 7-bit-safe wrappers.

    To support Python 2 and 3 with a single code base, define
    ``__str__`` and ``__repr__`` methods returning unicode text and
    apply this decorator to the class.

    The original ``__str__`` and ``__repr__`` stay available as
    ``__unicode__`` and ``unicode_repr`` (under both Python 2 and
    Python 3).

    :raises ValueError: if ``klass`` is not a subclass of ``object``
    """
    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # Both __unicode__ and unicode_repr are public because they
    # may be useful in the console under Python 2.x.
    #
    # If __str__ or __repr__ is not overridden in a subclass, it may
    # already have been fixed by this decorator in a parent class,
    # and we shouldn't fix it again.
    py2 = not PY3

    current_str = klass.__str__
    if not _was_fixed(current_str):
        klass.__unicode__ = current_str
        if py2:
            klass.__str__ = _7bit(_transliterated(klass.__unicode__))

    current_repr = klass.__repr__
    if not _was_fixed(current_repr):
        klass.unicode_repr = current_repr
        if py2:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass
-
-
def unicode_repr(obj):
    """
    Return a representation of ``obj`` that reads the same under
    Python 2.x and Python 3.x.

    Under Python 3 this is plain ``repr(obj)``.  Under Python 2, an
    object that has a ``unicode_repr`` attribute (as set by
    @python_2_unicode_compatible) yields ``obj.unicode_repr()``, a
    unicode string yields its ``repr`` without the leading "u" letter,
    and anything else yields ``repr(obj)``.
    """
    if not PY3:
        # Python 2.x only
        if hasattr(obj, 'unicode_repr'):
            return obj.unicode_repr()

        if isinstance(obj, text_type):
            return repr(obj)[1:]  # strip "u" letter from output

    return repr(obj)
-
-
- def _transliterated(method):
- def wrapper(self):
- return transliterate(method(self))
-
- update_wrapper(wrapper, method, ["__name__", "__doc__"])
- if hasattr(method, "_nltk_compat_7bit"):
- wrapper._nltk_compat_7bit = method._nltk_compat_7bit
-
- wrapper._nltk_compat_transliterated = True
- return wrapper
-
-
- def _7bit(method):
- def wrapper(self):
- return method(self).encode('ascii', 'backslashreplace')
-
- update_wrapper(wrapper, method, ["__name__", "__doc__"])
-
- if hasattr(method, "_nltk_compat_transliterated"):
- wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
-
- wrapper._nltk_compat_7bit = True
- return wrapper
-
-
- def _was_fixed(method):
- return getattr(method, "_nltk_compat_7bit", False) or getattr(
- method, "_nltk_compat_transliterated", False
- )
-
-
class Fraction(fractions.Fraction):
    """
    A simplified, backwards compatible version of fractions.Fraction
    from Python >=3.5.  It adds the `_normalize` parameter so that the
    numerator and denominator can be stored as given instead of being
    reduced by their Greatest Common Divisor (gcd), e.g. when the
    numerator is 0.

    This is most probably only used by nltk.translate.bleu_score.py, where
    numerator and denominator of the different ngram precisions are mutable.
    But the idea of a "mutable" fraction might not be applicable to other
    usages.  See http://stackoverflow.com/questions/34561265

    This object should be deprecated once NLTK stops supporting Python < 3.5
    See https://github.com/nltk/nltk/issues/1330
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        instance = super(Fraction, cls).__new__(cls, numerator, denominator)
        if _normalize:
            return instance

        # To emulate fraction.Fraction.from_float across Python >=2.7,
        # the raw values are only stored when the numerator is exactly an
        # int and a non-zero denominator was given.
        if type(numerator) == int and denominator:
            instance._numerator = numerator
            instance._denominator = denominator
        return instance
|