# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
# Copyright (C) 2001-2018 NLTK Project
#
# URL:
# For license information, see LICENSE.TXT

from __future__ import absolute_import, print_function
import os
import sys
from functools import update_wrapper, wraps
import fractions
import unicodedata

from six import string_types, text_type

# Python 2/3 compatibility layer. Based on six.

PY3 = sys.version_info[0] == 3

if PY3:
    def get_im_class(meth):
        """Return the class of the instance that bound method ``meth`` is bound to."""
        return meth.__self__.__class__

    import io
    StringIO = io.StringIO
    BytesIO = io.BytesIO

    from datetime import timezone
    UTC = timezone.utc

    from tempfile import TemporaryDirectory

else:
    def get_im_class(meth):
        """Return the ``im_class`` of the Python 2 method object ``meth``."""
        return meth.im_class

    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    BytesIO = StringIO

    from datetime import tzinfo, timedelta

    ZERO = timedelta(0)
    HOUR = timedelta(hours=1)

    # A UTC class for python 2.7
    class UTC(tzinfo):
        """UTC"""

        def utcoffset(self, dt):
            return ZERO

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return ZERO

    # The name ``UTC`` is rebound from the class to a single instance of it.
    UTC = UTC()

    import csv
    import codecs
    import cStringIO

    class UnicodeWriter:
        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        see https://docs.python.org/2/library/csv.html
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8",
                     errors='replace', **kwds):
            # Redirect output to a queue
            self.queue = cStringIO.StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            encoder_cls = codecs.getincrementalencoder(encoding)
            self.encoder = encoder_cls(errors=errors)

        def encode(self, data):
            """Return string cells encoded as UTF-8; other values unchanged."""
            if isinstance(data, string_types):
                return data.encode("utf-8")
            else:
                return data

        def writerow(self, row):
            """Write one row to the target stream in the target encoding."""
            self.writer.writerow([self.encode(s) for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and reencode it into the target encoding
            # NOTE(review): the second positional parameter of
            # IncrementalEncoder.encode() is documented as ``final``, so
            # 'replace' looks like it is read as a truthy ``final`` flag
            # rather than as an error handler (the handler was already
            # given to the encoder's constructor) -- confirm intent.
            data = self.encoder.encode(data, 'replace')
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

    import warnings as _warnings
    import os as _os
    from tempfile import mkdtemp

    class TemporaryDirectory(object):
        """Create and return a temporary directory.  This has the same
        behavior as mkdtemp but can be used as a context manager.  For
        example:

            with TemporaryDirectory() as tmpdir:
                ...

        Upon exiting the context, the directory and everything contained
        in it are removed.

        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
        """

        def __init__(self, suffix="", prefix="tmp", dir=None):
            self._closed = False
            self.name = None  # Handle mkdtemp raising an exception
            self.name = mkdtemp(suffix, prefix, dir)

        def __repr__(self):
            return "<{} {!r}>".format(self.__class__.__name__, self.name)

        def __enter__(self):
            return self.name

        def cleanup(self, _warn=False):
            """Remove the directory tree unless it was already cleaned up.

            When ``_warn`` is true, a ``Warning`` is issued after a
            successful removal.
            """
            if self.name and not self._closed:
                try:
                    self._rmtree(self.name)
                except (TypeError, AttributeError) as ex:
                    # Issue #10188: Emit a warning on stderr
                    # if the directory could not be cleaned
                    # up due to missing globals
                    if "None" not in str(ex):
                        raise
                    print("ERROR: {!r} while cleaning up {!r}".format(ex,
                                                                      self),
                          file=sys.stderr)
                    return
                self._closed = True
                if _warn:
                    self._warn("Implicitly cleaning up {!r}".format(self),
                               Warning)

        def __exit__(self, exc, value, tb):
            self.cleanup()

        def __del__(self):
            # Issue a Warning if implicit cleanup needed
            self.cleanup(_warn=True)

        # XXX (ncoghlan): The following code attempts to make
        # this class tolerant of the module nulling out process
        # that happens during CPython interpreter shutdown
        # Alas, it doesn't actually manage it. See issue #10188
        _listdir = staticmethod(_os.listdir)
        _path_join = staticmethod(_os.path.join)
        _isdir = staticmethod(_os.path.isdir)
        _islink = staticmethod(_os.path.islink)
        _remove = staticmethod(_os.remove)
        _rmdir = staticmethod(_os.rmdir)
        _warn = _warnings.warn

        def _rmtree(self, path):
            # Essentially a stripped down version of shutil.rmtree.  We can't
            # use globals because they may be None'ed out at shutdown.
            for name in self._listdir(path):
                fullname = self._path_join(path, name)
                try:
                    isdir = (self._isdir(fullname) and
                             not self._islink(fullname))
                except OSError:
                    isdir = False
                if isdir:
                    self._rmtree(fullname)
                else:
                    try:
                        self._remove(fullname)
                    except OSError:
                        pass
            try:
                self._rmdir(path)
            except OSError:
                pass


# ======= Compatibility for datasets that care about Python versions ========

# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
DATA_UPDATES = [("chunkers", "maxent_ne_chunker"),
                ("help", "tagsets"),
                ("taggers", "maxent_treebank_pos_tagger"),
                ("tokenizers", "punkt")]

_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]


def add_py3_data(path):
    """Insert ``/PY3`` into ``path`` when it points into a dataset listed
    in ``DATA_UPDATES`` and the interpreter is Python 3.

    ``/PY3`` is placed directly after the dataset name, or after the
    ``.zip`` suffix when one immediately follows the name.  Only the first
    matching dataset (in ``DATA_UPDATES`` order) is applied.

    :param path: the data path to adjust.
    :return: the adjusted string; ``path`` itself is returned unchanged
        under Python 2, when it already contains ``/PY3``, when no dataset
        name occurs in it, or when it is not a string.
    """
    # The rewrite relies on str searching and slicing, so any other kind of
    # object is passed through untouched rather than failing halfway.
    if not PY3 or not isinstance(path, string_types):
        return path
    if "/PY3" in path:
        return path
    for item in _PY3_DATA_UPDATES:
        start = path.find(item)
        if start == -1:
            continue
        pos = start + len(item)
        if path[pos:pos + 4] == ".zip":
            pos += 4
        return path[:pos] + "/PY3" + path[pos:]
    return path


# for use in adding /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """Decorate ``init_func`` so that its second positional argument is
    passed through ``add_py3_data`` before the call.

    The wrapped callable must be invoked with at least two positional
    arguments (typically ``self`` and the path).
    """
    def _decorator(*args, **kwargs):
        args = (args[0], add_py3_data(args[1])) + args[2:]
        return init_func(*args, **kwargs)
    return wraps(init_func)(_decorator)


# ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
    """Return ``text`` with combining marks removed.

    Byte strings are decoded as ASCII first.  The text is NFKD-normalized
    and every character in Unicode category ``Mn`` is dropped.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    category = unicodedata.category  # this gives a small (~10%) speedup
    return ''.join(
        c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
    )


# Select the best transliteration method:
try:
    # Older versions of Unidecode are licensed under Artistic License;
    # assume an older version is installed.
    from unidecode import unidecode as transliterate
except ImportError:
    try:
        # text-unidecode implementation is worse than Unidecode
        # implementation so Unidecode is preferred.
        from text_unidecode import unidecode as transliterate
    except ImportError:
        # This transliteration method should be enough
        # for many Western languages.
        transliterate = remove_accents


def python_2_unicode_compatible(klass):
    """
    This decorator defines __unicode__ method and fixes
    __repr__ and __str__ methods under Python 2.

    To support Python 2 and 3 with a single code base,
    define __str__ and __repr__ methods returning unicode
    text and apply this decorator to the class.

    Original __repr__ and __str__ would be available
    as unicode_repr and __unicode__ (under both Python 2
    and Python 3).

    :raise ValueError: if ``klass`` is not a subclass of ``object``.
    """

    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # both __unicode__ and unicode_repr are public because they
    # may be useful in console under Python 2.x

    # if __str__ or __repr__ are not overridden in a subclass,
    # they may be already fixed by this decorator in a parent class
    # and we shouldn't fix them again

    if not _was_fixed(klass.__str__):
        klass.__unicode__ = klass.__str__
        if not PY3:
            klass.__str__ = _7bit(_transliterated(klass.__unicode__))

    if not _was_fixed(klass.__repr__):
        klass.unicode_repr = klass.__repr__
        if not PY3:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass


def unicode_repr(obj):
    """
    For classes that were fixed with @python_2_unicode_compatible
    ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
    the result is returned without "u" letter (to make output the
    same under Python 2.x and Python 3.x); for other variables
    it is the same as ``repr``.
    """
    if PY3:
        return repr(obj)

    # Python 2.x
    if hasattr(obj, 'unicode_repr'):
        return obj.unicode_repr()

    if isinstance(obj, text_type):
        return repr(obj)[1:]  # strip "u" letter from output

    return repr(obj)


def _transliterated(method):
    """Wrap ``method`` so that its result is passed through ``transliterate``.

    The wrapper is tagged with ``_nltk_compat_transliterated`` and keeps any
    ``_nltk_compat_7bit`` tag already present on ``method``.
    """
    def wrapper(self):
        return transliterate(method(self))

    update_wrapper(wrapper, method, ["__name__", "__doc__"])
    if hasattr(method, "_nltk_compat_7bit"):
        wrapper._nltk_compat_7bit = method._nltk_compat_7bit

    wrapper._nltk_compat_transliterated = True
    return wrapper


def _7bit(method):
    """Wrap ``method`` so that its result is encoded to ASCII, with
    unencodable characters written as backslash escapes.

    The wrapper is tagged with ``_nltk_compat_7bit`` and keeps any
    ``_nltk_compat_transliterated`` tag already present on ``method``.
    """
    def wrapper(self):
        return method(self).encode('ascii', 'backslashreplace')

    update_wrapper(wrapper, method, ["__name__", "__doc__"])

    if hasattr(method, "_nltk_compat_transliterated"):
        wrapper._nltk_compat_transliterated = (
            method._nltk_compat_transliterated
        )

    wrapper._nltk_compat_7bit = True
    return wrapper


def _was_fixed(method):
    """Return a truthy value if ``method`` carries a tag set by ``_7bit``
    or ``_transliterated``."""
    return (getattr(method, "_nltk_compat_7bit", False) or
            getattr(method, "_nltk_compat_transliterated", False))


class Fraction(fractions.Fraction):
    """
    This is a simplified backwards compatible version of fractions.Fraction
    from Python >=3.5. It adds the `_normalize` parameter such that it does
    not normalize the denominator to the Greatest Common Divisor (gcd) when
    the numerator is 0.

    This is most probably only used by the nltk.translate.bleu_score.py where
    numerator and denominator of the different ngram precisions are mutable.
    But the idea of "mutable" fraction might not be applicable to other usages,
    See http://stackoverflow.com/questions/34561265

    This object should be deprecated once NLTK stops supporting Python < 3.5
    See https://github.com/nltk/nltk/issues/1330
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        """Create a fraction, optionally keeping the given terms unreduced.

        :param numerator: as for ``fractions.Fraction``.
        :param denominator: as for ``fractions.Fraction``.
        :param _normalize: when false, and ``numerator`` is exactly an
            ``int`` and ``denominator`` is truthy, the two values are
            stored as given instead of in lowest terms.
        """
        # The parent constructor validates the arguments (and raises for a
        # zero denominator) before the stored terms are overridden below.
        obj = super(Fraction, cls).__new__(cls, numerator, denominator)
        # To emulate fraction.Fraction.from_float across Python >=2.7,
        # check that numerator is an integer and denominator is not None.
        if not _normalize and type(numerator) is int and denominator:
            obj._numerator = numerator
            obj._denominator = denominator
        return obj