You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

490 lines
17 KiB

4 years ago
  1. # $Id: io.py 8129 2017-06-27 14:55:22Z grubert $
  2. # Author: David Goodger <goodger@python.org>
  3. # Copyright: This module has been placed in the public domain.
  4. """
  5. I/O classes provide a uniform API for low-level input and output. Subclasses
  6. exist for a variety of input/output mechanisms.
  7. """
  8. __docformat__ = 'reStructuredText'
  9. import sys
  10. import os
  11. import re
  12. import codecs
  13. from docutils import TransformSpec
  14. from docutils._compat import b
  15. from docutils.utils.error_reporting import locale_encoding, ErrorString, ErrorOutput
  16. class InputError(IOError): pass
  17. class OutputError(IOError): pass
  18. def check_encoding(stream, encoding):
  19. """Test, whether the encoding of `stream` matches `encoding`.
  20. Returns
  21. :None: if `encoding` or `stream.encoding` are not a valid encoding
  22. argument (e.g. ``None``) or `stream.encoding is missing.
  23. :True: if the encoding argument resolves to the same value as `encoding`,
  24. :False: if the encodings differ.
  25. """
  26. try:
  27. return codecs.lookup(stream.encoding) == codecs.lookup(encoding)
  28. except (LookupError, AttributeError, TypeError):
  29. return None
  30. class Input(TransformSpec):
  31. """
  32. Abstract base class for input wrappers.
  33. """
  34. component_type = 'input'
  35. default_source_path = None
  36. def __init__(self, source=None, source_path=None, encoding=None,
  37. error_handler='strict'):
  38. self.encoding = encoding
  39. """Text encoding for the input source."""
  40. self.error_handler = error_handler
  41. """Text decoding error handler."""
  42. self.source = source
  43. """The source of input data."""
  44. self.source_path = source_path
  45. """A text reference to the source."""
  46. if not source_path:
  47. self.source_path = self.default_source_path
  48. self.successful_encoding = None
  49. """The encoding that successfully decoded the source data."""
  50. def __repr__(self):
  51. return '%s: source=%r, source_path=%r' % (self.__class__, self.source,
  52. self.source_path)
  53. def read(self):
  54. raise NotImplementedError
  55. def decode(self, data):
  56. """
  57. Decode a string, `data`, heuristically.
  58. Raise UnicodeError if unsuccessful.
  59. The client application should call ``locale.setlocale`` at the
  60. beginning of processing::
  61. locale.setlocale(locale.LC_ALL, '')
  62. """
  63. if self.encoding and self.encoding.lower() == 'unicode':
  64. assert isinstance(data, str), (
  65. 'input encoding is "unicode" '
  66. 'but input is not a unicode object')
  67. if isinstance(data, str):
  68. # Accept unicode even if self.encoding != 'unicode'.
  69. return data
  70. if self.encoding:
  71. # We believe the user/application when the encoding is
  72. # explicitly given.
  73. encodings = [self.encoding]
  74. else:
  75. data_encoding = self.determine_encoding_from_data(data)
  76. if data_encoding:
  77. # If the data declares its encoding (explicitly or via a BOM),
  78. # we believe it.
  79. encodings = [data_encoding]
  80. else:
  81. # Apply heuristics only if no encoding is explicitly given and
  82. # no BOM found. Start with UTF-8, because that only matches
  83. # data that *IS* UTF-8:
  84. encodings = ['utf-8', 'latin-1']
  85. if locale_encoding:
  86. encodings.insert(1, locale_encoding)
  87. for enc in encodings:
  88. try:
  89. decoded = str(data, enc, self.error_handler)
  90. self.successful_encoding = enc
  91. # Return decoded, removing BOMs.
  92. return decoded.replace('\ufeff', '')
  93. except (UnicodeError, LookupError) as err:
  94. error = err # in Python 3, the <exception instance> is
  95. # local to the except clause
  96. raise UnicodeError(
  97. 'Unable to decode input data. Tried the following encodings: '
  98. '%s.\n(%s)' % (', '.join([repr(enc) for enc in encodings]),
  99. ErrorString(error)))
  100. coding_slug = re.compile(b(r"coding[:=]\s*([-\w.]+)"))
  101. """Encoding declaration pattern."""
  102. byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'), # 'utf-8-sig' new in v2.5
  103. (codecs.BOM_UTF16_BE, 'utf-16-be'),
  104. (codecs.BOM_UTF16_LE, 'utf-16-le'),)
  105. """Sequence of (start_bytes, encoding) tuples for encoding detection.
  106. The first bytes of input data are checked against the start_bytes strings.
  107. A match indicates the given encoding."""
  108. def determine_encoding_from_data(self, data):
  109. """
  110. Try to determine the encoding of `data` by looking *in* `data`.
  111. Check for a byte order mark (BOM) or an encoding declaration.
  112. """
  113. # check for a byte order mark:
  114. for start_bytes, encoding in self.byte_order_marks:
  115. if data.startswith(start_bytes):
  116. return encoding
  117. # check for an encoding declaration pattern in first 2 lines of file:
  118. for line in data.splitlines()[:2]:
  119. match = self.coding_slug.search(line)
  120. if match:
  121. return match.group(1).decode('ascii')
  122. return None
  123. class Output(TransformSpec):
  124. """
  125. Abstract base class for output wrappers.
  126. """
  127. component_type = 'output'
  128. default_destination_path = None
  129. def __init__(self, destination=None, destination_path=None,
  130. encoding=None, error_handler='strict'):
  131. self.encoding = encoding
  132. """Text encoding for the output destination."""
  133. self.error_handler = error_handler or 'strict'
  134. """Text encoding error handler."""
  135. self.destination = destination
  136. """The destination for output data."""
  137. self.destination_path = destination_path
  138. """A text reference to the destination."""
  139. if not destination_path:
  140. self.destination_path = self.default_destination_path
  141. def __repr__(self):
  142. return ('%s: destination=%r, destination_path=%r'
  143. % (self.__class__, self.destination, self.destination_path))
  144. def write(self, data):
  145. """`data` is a Unicode string, to be encoded by `self.encode`."""
  146. raise NotImplementedError
  147. def encode(self, data):
  148. if self.encoding and self.encoding.lower() == 'unicode':
  149. assert isinstance(data, str), (
  150. 'the encoding given is "unicode" but the output is not '
  151. 'a Unicode string')
  152. return data
  153. if not isinstance(data, str):
  154. # Non-unicode (e.g. bytes) output.
  155. return data
  156. else:
  157. return data.encode(self.encoding, self.error_handler)
  158. class FileInput(Input):
  159. """
  160. Input for single, simple file-like objects.
  161. """
  162. def __init__(self, source=None, source_path=None,
  163. encoding=None, error_handler='strict',
  164. autoclose=True, mode='rU', **kwargs):
  165. """
  166. :Parameters:
  167. - `source`: either a file-like object (which is read directly), or
  168. `None` (which implies `sys.stdin` if no `source_path` given).
  169. - `source_path`: a path to a file, which is opened and then read.
  170. - `encoding`: the expected text encoding of the input file.
  171. - `error_handler`: the encoding error handler to use.
  172. - `autoclose`: close automatically after read (except when
  173. `sys.stdin` is the source).
  174. - `mode`: how the file is to be opened (see standard function
  175. `open`). The default 'rU' provides universal newline support
  176. for text files.
  177. """
  178. Input.__init__(self, source, source_path, encoding, error_handler)
  179. self.autoclose = autoclose
  180. self._stderr = ErrorOutput()
  181. # deprecation warning
  182. for key in kwargs:
  183. if key == 'handle_io_errors':
  184. sys.stderr.write('deprecation warning: '
  185. 'io.FileInput() argument `handle_io_errors` '
  186. 'is ignored since "Docutils 0.10 (2012-12-16)" '
  187. 'and will soon be removed.')
  188. else:
  189. raise TypeError('__init__() got an unexpected keyword '
  190. "argument '%s'" % key)
  191. if source is None:
  192. if source_path:
  193. # Specify encoding in Python 3
  194. if sys.version_info >= (3,0):
  195. kwargs = {'encoding': self.encoding,
  196. 'errors': self.error_handler}
  197. else:
  198. kwargs = {}
  199. try:
  200. self.source = open(source_path, mode, **kwargs)
  201. except IOError as error:
  202. raise InputError(error.errno, error.strerror, source_path)
  203. else:
  204. self.source = sys.stdin
  205. elif (sys.version_info >= (3,0) and
  206. check_encoding(self.source, self.encoding) is False):
  207. # TODO: re-open, warn or raise error?
  208. raise UnicodeError('Encoding clash: encoding given is "%s" '
  209. 'but source is opened with encoding "%s".' %
  210. (self.encoding, self.source.encoding))
  211. if not source_path:
  212. try:
  213. self.source_path = self.source.name
  214. except AttributeError:
  215. pass
  216. def read(self):
  217. """
  218. Read and decode a single file and return the data (Unicode string).
  219. """
  220. try: # In Python < 2.5, try...except has to be nested in try...finally.
  221. try:
  222. if self.source is sys.stdin and sys.version_info >= (3,0):
  223. # read as binary data to circumvent auto-decoding
  224. data = self.source.buffer.read()
  225. # normalize newlines
  226. data = b('\n').join(data.splitlines()) + b('\n')
  227. else:
  228. data = self.source.read()
  229. except (UnicodeError, LookupError) as err: # (in Py3k read() decodes)
  230. if not self.encoding and self.source_path:
  231. # re-read in binary mode and decode with heuristics
  232. b_source = open(self.source_path, 'rb')
  233. data = b_source.read()
  234. b_source.close()
  235. # normalize newlines
  236. data = b('\n').join(data.splitlines()) + b('\n')
  237. else:
  238. raise
  239. finally:
  240. if self.autoclose:
  241. self.close()
  242. return self.decode(data)
  243. def readlines(self):
  244. """
  245. Return lines of a single file as list of Unicode strings.
  246. """
  247. return self.read().splitlines(True)
  248. def close(self):
  249. if self.source is not sys.stdin:
  250. self.source.close()
  251. class FileOutput(Output):
  252. """
  253. Output for single, simple file-like objects.
  254. """
  255. mode = 'w'
  256. """The mode argument for `open()`."""
  257. # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
  258. # (Do not use binary mode ('wb') for text files, as this prevents the
  259. # conversion of newlines to the system specific default.)
  260. def __init__(self, destination=None, destination_path=None,
  261. encoding=None, error_handler='strict', autoclose=True,
  262. handle_io_errors=None, mode=None):
  263. """
  264. :Parameters:
  265. - `destination`: either a file-like object (which is written
  266. directly) or `None` (which implies `sys.stdout` if no
  267. `destination_path` given).
  268. - `destination_path`: a path to a file, which is opened and then
  269. written.
  270. - `encoding`: the text encoding of the output file.
  271. - `error_handler`: the encoding error handler to use.
  272. - `autoclose`: close automatically after write (except when
  273. `sys.stdout` or `sys.stderr` is the destination).
  274. - `handle_io_errors`: ignored, deprecated, will be removed.
  275. - `mode`: how the file is to be opened (see standard function
  276. `open`). The default is 'w', providing universal newline
  277. support for text files.
  278. """
  279. Output.__init__(self, destination, destination_path,
  280. encoding, error_handler)
  281. self.opened = True
  282. self.autoclose = autoclose
  283. if mode is not None:
  284. self.mode = mode
  285. self._stderr = ErrorOutput()
  286. if destination is None:
  287. if destination_path:
  288. self.opened = False
  289. else:
  290. self.destination = sys.stdout
  291. elif (# destination is file-type object -> check mode:
  292. mode and hasattr(self.destination, 'mode')
  293. and mode != self.destination.mode):
  294. print(('Warning: Destination mode "%s" '
  295. 'differs from specified mode "%s"' %
  296. (self.destination.mode, mode)), file=self._stderr)
  297. if not destination_path:
  298. try:
  299. self.destination_path = self.destination.name
  300. except AttributeError:
  301. pass
  302. def open(self):
  303. # Specify encoding in Python 3.
  304. if sys.version_info >= (3,0) and 'b' not in self.mode:
  305. kwargs = {'encoding': self.encoding,
  306. 'errors': self.error_handler}
  307. else:
  308. kwargs = {}
  309. try:
  310. self.destination = open(self.destination_path, self.mode, **kwargs)
  311. except IOError as error:
  312. raise OutputError(error.errno, error.strerror,
  313. self.destination_path)
  314. self.opened = True
  315. def write(self, data):
  316. """Encode `data`, write it to a single file, and return it.
  317. With Python 3 or binary output mode, `data` is returned unchanged,
  318. except when specified encoding and output encoding differ.
  319. """
  320. if not self.opened:
  321. self.open()
  322. if ('b' not in self.mode and sys.version_info < (3,0)
  323. or check_encoding(self.destination, self.encoding) is False
  324. ):
  325. data = self.encode(data)
  326. if sys.version_info >= (3,0) and os.linesep != '\n':
  327. data = data.replace(b('\n'), b(os.linesep)) # fix endings
  328. try: # In Python < 2.5, try...except has to be nested in try...finally.
  329. try:
  330. self.destination.write(data)
  331. except TypeError as e:
  332. if sys.version_info >= (3,0) and isinstance(data, bytes):
  333. try:
  334. self.destination.buffer.write(data)
  335. except AttributeError:
  336. if check_encoding(self.destination,
  337. self.encoding) is False:
  338. raise ValueError('Encoding of %s (%s) differs \n'
  339. ' from specified encoding (%s)' %
  340. (self.destination_path or 'destination',
  341. self.destination.encoding, self.encoding))
  342. else:
  343. raise e
  344. except (UnicodeError, LookupError) as err:
  345. raise UnicodeError(
  346. 'Unable to encode output data. output-encoding is: '
  347. '%s.\n(%s)' % (self.encoding, ErrorString(err)))
  348. finally:
  349. if self.autoclose:
  350. self.close()
  351. return data
  352. def close(self):
  353. if self.destination not in (sys.stdout, sys.stderr):
  354. self.destination.close()
  355. self.opened = False
  356. class BinaryFileOutput(FileOutput):
  357. """
  358. A version of docutils.io.FileOutput which writes to a binary file.
  359. """
  360. # Used by core.publish_cmdline_to_binary() which in turn is used by
  361. # rst2odt (OpenOffice writer)
  362. mode = 'wb'
  363. class StringInput(Input):
  364. """
  365. Direct string input.
  366. """
  367. default_source_path = '<string>'
  368. def read(self):
  369. """Decode and return the source string."""
  370. return self.decode(self.source)
  371. class StringOutput(Output):
  372. """
  373. Direct string output.
  374. """
  375. default_destination_path = '<string>'
  376. def write(self, data):
  377. """Encode `data`, store it in `self.destination`, and return it."""
  378. self.destination = self.encode(data)
  379. return self.destination
  380. class NullInput(Input):
  381. """
  382. Degenerate input: read nothing.
  383. """
  384. default_source_path = 'null input'
  385. def read(self):
  386. """Return a null string."""
  387. return ''
  388. class NullOutput(Output):
  389. """
  390. Degenerate output: write nothing.
  391. """
  392. default_destination_path = 'null output'
  393. def write(self, data):
  394. """Do nothing ([don't even] send data to the bit bucket)."""
  395. pass
  396. class DocTreeInput(Input):
  397. """
  398. Adapter for document tree input.
  399. The document tree must be passed in the ``source`` parameter.
  400. """
  401. default_source_path = 'doctree input'
  402. def read(self):
  403. """Return the document tree."""
  404. return self.source