You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

373 lines
12 KiB

4 years ago
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Compatibility
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. #
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from __future__ import absolute_import, print_function
  9. import os
  10. import sys
  11. from functools import update_wrapper, wraps
  12. import fractions
  13. import unicodedata
  14. from six import string_types, text_type
  15. # Python 2/3 compatibility layer. Based on six.
  16. PY3 = sys.version_info[0] == 3
  17. if PY3:
  18. def get_im_class(meth):
  19. return meth.__self__.__class__
  20. import io
  21. StringIO = io.StringIO
  22. BytesIO = io.BytesIO
  23. from datetime import timezone
  24. UTC = timezone.utc
  25. from tempfile import TemporaryDirectory
  26. else:
  27. def get_im_class(meth):
  28. return meth.im_class
  29. try:
  30. from cStringIO import StringIO
  31. except ImportError:
  32. from StringIO import StringIO
  33. BytesIO = StringIO
  34. from datetime import tzinfo, timedelta
  35. ZERO = timedelta(0)
  36. HOUR = timedelta(hours=1)
  37. # A UTC class for python 2.7
  38. class UTC(tzinfo):
  39. """UTC"""
  40. def utcoffset(self, dt):
  41. return ZERO
  42. def tzname(self, dt):
  43. return "UTC"
  44. def dst(self, dt):
  45. return ZERO
  46. UTC = UTC()
  47. import csv
  48. import codecs
  49. import cStringIO
  50. class UnicodeWriter:
  51. """
  52. A CSV writer which will write rows to CSV file "f",
  53. which is encoded in the given encoding.
  54. see https://docs.python.org/2/library/csv.html
  55. """
  56. def __init__(
  57. self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
  58. ):
  59. # Redirect output to a queue
  60. self.queue = cStringIO.StringIO()
  61. self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  62. self.stream = f
  63. encoder_cls = codecs.getincrementalencoder(encoding)
  64. self.encoder = encoder_cls(errors=errors)
  65. def encode(self, data):
  66. if isinstance(data, string_types):
  67. return data.encode("utf-8")
  68. else:
  69. return data
  70. def writerow(self, row):
  71. self.writer.writerow([self.encode(s) for s in row])
  72. # Fetch UTF-8 output from the queue ...
  73. data = self.queue.getvalue()
  74. data = data.decode("utf-8")
  75. # ... and reencode it into the target encoding
  76. data = self.encoder.encode(data, 'replace')
  77. # write to the target stream
  78. self.stream.write(data)
  79. # empty queue
  80. self.queue.truncate(0)
  81. import warnings as _warnings
  82. import os as _os
  83. from tempfile import mkdtemp
  84. class TemporaryDirectory(object):
  85. """Create and return a temporary directory. This has the same
  86. behavior as mkdtemp but can be used as a context manager. For
  87. example:
  88. with TemporaryDirectory() as tmpdir:
  89. ...
  90. Upon exiting the context, the directory and everything contained
  91. in it are removed.
  92. http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
  93. """
  94. def __init__(self, suffix="", prefix="tmp", dir=None):
  95. self._closed = False
  96. self.name = None # Handle mkdtemp raising an exception
  97. self.name = mkdtemp(suffix, prefix, dir)
  98. def __repr__(self):
  99. return "<{} {!r}>".format(self.__class__.__name__, self.name)
  100. def __enter__(self):
  101. return self.name
  102. def cleanup(self, _warn=False):
  103. if self.name and not self._closed:
  104. try:
  105. self._rmtree(self.name)
  106. except (TypeError, AttributeError) as ex:
  107. # Issue #10188: Emit a warning on stderr
  108. # if the directory could not be cleaned
  109. # up due to missing globals
  110. if "None" not in str(ex):
  111. raise
  112. print(
  113. "ERROR: {!r} while cleaning up {!r}".format(ex, self),
  114. file=sys.stderr,
  115. )
  116. return
  117. self._closed = True
  118. if _warn:
  119. self._warn("Implicitly cleaning up {!r}".format(self), Warning)
  120. def __exit__(self, exc, value, tb):
  121. self.cleanup()
  122. def __del__(self):
  123. # Issue a Warning if implicit cleanup needed
  124. self.cleanup(_warn=True)
  125. # XXX (ncoghlan): The following code attempts to make
  126. # this class tolerant of the module nulling out process
  127. # that happens during CPython interpreter shutdown
  128. # Alas, it doesn't actually manage it. See issue #10188
  129. _listdir = staticmethod(_os.listdir)
  130. _path_join = staticmethod(_os.path.join)
  131. _isdir = staticmethod(_os.path.isdir)
  132. _islink = staticmethod(_os.path.islink)
  133. _remove = staticmethod(_os.remove)
  134. _rmdir = staticmethod(_os.rmdir)
  135. _warn = _warnings.warn
  136. def _rmtree(self, path):
  137. # Essentially a stripped down version of shutil.rmtree. We can't
  138. # use globals because they may be None'ed out at shutdown.
  139. for name in self._listdir(path):
  140. fullname = self._path_join(path, name)
  141. try:
  142. isdir = self._isdir(fullname) and not self._islink(fullname)
  143. except OSError:
  144. isdir = False
  145. if isdir:
  146. self._rmtree(fullname)
  147. else:
  148. try:
  149. self._remove(fullname)
  150. except OSError:
  151. pass
  152. try:
  153. self._rmdir(path)
  154. except OSError:
  155. pass
  156. # ======= Compatibility for datasets that care about Python versions ========
  157. # The following datasets have a /PY3 subdirectory containing
  158. # a full copy of the data which has been re-encoded or repickled.
  159. DATA_UPDATES = [
  160. ("chunkers", "maxent_ne_chunker"),
  161. ("help", "tagsets"),
  162. ("taggers", "maxent_treebank_pos_tagger"),
  163. ("tokenizers", "punkt"),
  164. ]
  165. _PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
  166. def add_py3_data(path):
  167. if PY3:
  168. for item in _PY3_DATA_UPDATES:
  169. if item in str(path) and "/PY3" not in str(path):
  170. pos = path.index(item) + len(item)
  171. if path[pos : pos + 4] == ".zip":
  172. pos += 4
  173. path = path[:pos] + "/PY3" + path[pos:]
  174. break
  175. return path
  176. # for use in adding /PY3 to the second (filename) argument
  177. # of the file pointers in data.py
  178. def py3_data(init_func):
  179. def _decorator(*args, **kwargs):
  180. args = (args[0], add_py3_data(args[1])) + args[2:]
  181. return init_func(*args, **kwargs)
  182. return wraps(init_func)(_decorator)
  183. # ======= Compatibility layer for __str__ and __repr__ ==========
  184. def remove_accents(text):
  185. if isinstance(text, bytes):
  186. text = text.decode('ascii')
  187. category = unicodedata.category # this gives a small (~10%) speedup
  188. return ''.join(
  189. c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
  190. )
  191. # Select the best transliteration method:
  192. try:
  193. # Older versions of Unidecode are licensed under Artistic License;
  194. # assume an older version is installed.
  195. from unidecode import unidecode as transliterate
  196. except ImportError:
  197. try:
  198. # text-unidecode implementation is worse than Unidecode
  199. # implementation so Unidecode is preferred.
  200. from text_unidecode import unidecode as transliterate
  201. except ImportError:
  202. # This transliteration method should be enough
  203. # for many Western languages.
  204. transliterate = remove_accents
  205. def python_2_unicode_compatible(klass):
  206. """
  207. This decorator defines __unicode__ method and fixes
  208. __repr__ and __str__ methods under Python 2.
  209. To support Python 2 and 3 with a single code base,
  210. define __str__ and __repr__ methods returning unicode
  211. text and apply this decorator to the class.
  212. Original __repr__ and __str__ would be available
  213. as unicode_repr and __unicode__ (under both Python 2
  214. and Python 3).
  215. """
  216. if not issubclass(klass, object):
  217. raise ValueError("This decorator doesn't work for old-style classes")
  218. # both __unicode__ and unicode_repr are public because they
  219. # may be useful in console under Python 2.x
  220. # if __str__ or __repr__ are not overriden in a subclass,
  221. # they may be already fixed by this decorator in a parent class
  222. # and we shouldn't them again
  223. if not _was_fixed(klass.__str__):
  224. klass.__unicode__ = klass.__str__
  225. if not PY3:
  226. klass.__str__ = _7bit(_transliterated(klass.__unicode__))
  227. if not _was_fixed(klass.__repr__):
  228. klass.unicode_repr = klass.__repr__
  229. if not PY3:
  230. klass.__repr__ = _7bit(klass.unicode_repr)
  231. return klass
  232. def unicode_repr(obj):
  233. """
  234. For classes that was fixed with @python_2_unicode_compatible
  235. ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
  236. the result is returned without "u" letter (to make output the
  237. same under Python 2.x and Python 3.x); for other variables
  238. it is the same as ``repr``.
  239. """
  240. if PY3:
  241. return repr(obj)
  242. # Python 2.x
  243. if hasattr(obj, 'unicode_repr'):
  244. return obj.unicode_repr()
  245. if isinstance(obj, text_type):
  246. return repr(obj)[1:] # strip "u" letter from output
  247. return repr(obj)
  248. def _transliterated(method):
  249. def wrapper(self):
  250. return transliterate(method(self))
  251. update_wrapper(wrapper, method, ["__name__", "__doc__"])
  252. if hasattr(method, "_nltk_compat_7bit"):
  253. wrapper._nltk_compat_7bit = method._nltk_compat_7bit
  254. wrapper._nltk_compat_transliterated = True
  255. return wrapper
  256. def _7bit(method):
  257. def wrapper(self):
  258. return method(self).encode('ascii', 'backslashreplace')
  259. update_wrapper(wrapper, method, ["__name__", "__doc__"])
  260. if hasattr(method, "_nltk_compat_transliterated"):
  261. wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
  262. wrapper._nltk_compat_7bit = True
  263. return wrapper
  264. def _was_fixed(method):
  265. return getattr(method, "_nltk_compat_7bit", False) or getattr(
  266. method, "_nltk_compat_transliterated", False
  267. )
  268. class Fraction(fractions.Fraction):
  269. """
  270. This is a simplified backwards compatible version of fractions.Fraction
  271. from Python >=3.5. It adds the `_normalize` parameter such that it does
  272. not normalize the denominator to the Greatest Common Divisor (gcd) when
  273. the numerator is 0.
  274. This is most probably only used by the nltk.translate.bleu_score.py where
  275. numerator and denominator of the different ngram precisions are mutable.
  276. But the idea of "mutable" fraction might not be applicable to other usages,
  277. See http://stackoverflow.com/questions/34561265
  278. This objects should be deprecated once NLTK stops supporting Python < 3.5
  279. See https://github.com/nltk/nltk/issues/1330
  280. """
  281. def __new__(cls, numerator=0, denominator=None, _normalize=True):
  282. cls = super(Fraction, cls).__new__(cls, numerator, denominator)
  283. # To emulate fraction.Fraction.from_float across Python >=2.7,
  284. # check that numerator is an integer and denominator is not None.
  285. if not _normalize and type(numerator) == int and denominator:
  286. cls._numerator = numerator
  287. cls._denominator = denominator
  288. return cls