1487 lines
54 KiB
Python
1487 lines
54 KiB
Python
# Natural Language Toolkit: Utility functions
|
|
#
|
|
# Copyright (C) 2001-2018 NLTK Project
|
|
# Author: Edward Loper <edloper@gmail.com>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.TXT
|
|
|
|
"""
|
|
Functions to find and load NLTK resource files, such as corpora,
|
|
grammars, and saved processing objects. Resource files are identified
|
|
using URLs, such as ``nltk:corpora/abc/rural.txt`` or
|
|
``http://nltk.org/sample/toy.cfg``. The following URL protocols are
|
|
supported:
|
|
|
|
- ``file:path``: Specifies the file whose path is *path*.
|
|
Both relative and absolute paths may be used.
|
|
|
|
- ``http://host/path``: Specifies the file stored on the web
|
|
server *host* at path *path*.
|
|
|
|
- ``nltk:path``: Specifies the file stored in the NLTK data
|
|
package at *path*. NLTK will search for these files in the
|
|
directories specified by ``nltk.data.path``.
|
|
|
|
If no protocol is specified, then the default protocol ``nltk:`` will
|
|
be used.
|
|
|
|
This module provides to functions that can be used to access a
|
|
resource file, given its URL: ``load()`` loads a given resource, and
|
|
adds it to a resource cache; and ``retrieve()`` copies a given resource
|
|
to a local file.
|
|
"""
|
|
from __future__ import print_function, unicode_literals
|
|
from __future__ import division
|
|
from abc import ABCMeta, abstractmethod
|
|
from six import add_metaclass
|
|
|
|
import functools
|
|
import textwrap
|
|
import io
|
|
import os
|
|
import re
|
|
import sys
|
|
import zipfile
|
|
import codecs
|
|
|
|
from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE
|
|
|
|
try: # Python 3.
|
|
textwrap_indent = functools.partial(textwrap.indent, prefix=' ')
|
|
except AttributeError: # Python 2; indent() not available for Python2.
|
|
textwrap_fill = functools.partial(textwrap.fill,
|
|
initial_indent=' ',
|
|
subsequent_indent=' ',
|
|
replace_whitespace=False)
|
|
def textwrap_indent(text):
|
|
return '\n'.join(textwrap_fill(line) for line in text.splitlines())
|
|
|
|
try:
|
|
from zlib import Z_SYNC_FLUSH as FLUSH
|
|
except ImportError:
|
|
from zlib import Z_FINISH as FLUSH
|
|
|
|
try:
|
|
import cPickle as pickle
|
|
except ImportError:
|
|
import pickle
|
|
|
|
from six import string_types, text_type
|
|
from six.moves.urllib.request import urlopen, url2pathname
|
|
|
|
# this import should be more specific:
|
|
import nltk
|
|
from nltk.compat import py3_data, add_py3_data, BytesIO
|
|
|
|
######################################################################
|
|
# Search Path
|
|
######################################################################
|
|
|
|
path = []
|
|
"""A list of directories where the NLTK data package might reside.
|
|
These directories will be checked in order when looking for a
|
|
resource in the data package. Note that this allows users to
|
|
substitute in their own versions of resources, if they have them
|
|
(e.g., in their home directory under ~/nltk_data)."""
|
|
|
|
# User-specified locations:
|
|
_paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep)
|
|
path += [d for d in _paths_from_env if d]
|
|
if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/':
|
|
path.append(os.path.expanduser(str('~/nltk_data')))
|
|
|
|
if sys.platform.startswith('win'):
|
|
# Common locations on Windows:
|
|
path += [
|
|
str(r'C:\nltk_data'), str(r'D:\nltk_data'), str(r'E:\nltk_data'),
|
|
os.path.join(sys.prefix, str('nltk_data')),
|
|
os.path.join(sys.prefix, str('share'), str('nltk_data')),
|
|
os.path.join(sys.prefix, str('lib'), str('nltk_data')),
|
|
os.path.join(
|
|
os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data'))
|
|
]
|
|
else:
|
|
# Common locations on UNIX & OS X:
|
|
path += [
|
|
str('/usr/share/nltk_data'),
|
|
str('/usr/local/share/nltk_data'),
|
|
str('/usr/lib/nltk_data'),
|
|
str('/usr/local/lib/nltk_data'),
|
|
os.path.join(sys.prefix, str('nltk_data')),
|
|
os.path.join(sys.prefix, str('share'), str('nltk_data')),
|
|
os.path.join(sys.prefix, str('lib'), str('nltk_data'))
|
|
]
|
|
|
|
|
|
######################################################################
|
|
# Util Functions
|
|
######################################################################
|
|
|
|
def gzip_open_unicode(filename, mode="rb", compresslevel=9, encoding='utf-8',
|
|
fileobj=None, errors=None, newline=None):
|
|
if fileobj is None:
|
|
fileobj = GzipFile(filename, mode, compresslevel, fileobj)
|
|
return io.TextIOWrapper(fileobj, encoding, errors, newline)
|
|
|
|
|
|
def split_resource_url(resource_url):
|
|
"""
|
|
Splits a resource url into "<protocol>:<path>".
|
|
|
|
>>> windows = sys.platform.startswith('win')
|
|
>>> split_resource_url('nltk:home/nltk')
|
|
('nltk', 'home/nltk')
|
|
>>> split_resource_url('nltk:/home/nltk')
|
|
('nltk', '/home/nltk')
|
|
>>> split_resource_url('file:/home/nltk')
|
|
('file', '/home/nltk')
|
|
>>> split_resource_url('file:///home/nltk')
|
|
('file', '/home/nltk')
|
|
>>> split_resource_url('file:///C:/home/nltk')
|
|
('file', '/C:/home/nltk')
|
|
"""
|
|
protocol, path_ = resource_url.split(':', 1)
|
|
if protocol == 'nltk':
|
|
pass
|
|
elif protocol == 'file':
|
|
if path_.startswith('/'):
|
|
path_ = '/' + path_.lstrip('/')
|
|
else:
|
|
path_ = re.sub(r'^/{0,2}', '', path_)
|
|
return protocol, path_
|
|
|
|
|
|
def normalize_resource_url(resource_url):
|
|
r"""
|
|
Normalizes a resource url
|
|
|
|
>>> windows = sys.platform.startswith('win')
|
|
>>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \
|
|
... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg'))
|
|
True
|
|
>>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file'
|
|
True
|
|
>>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg'
|
|
True
|
|
>>> normalize_resource_url('nltk:home/nltk')
|
|
'nltk:home/nltk'
|
|
>>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk'
|
|
True
|
|
>>> normalize_resource_url('http://example.com/dir/file')
|
|
'http://example.com/dir/file'
|
|
>>> normalize_resource_url('dir/file')
|
|
'nltk:dir/file'
|
|
"""
|
|
try:
|
|
protocol, name = split_resource_url(resource_url)
|
|
except ValueError:
|
|
# the resource url has no protocol, use the nltk protocol by default
|
|
protocol = 'nltk'
|
|
name = resource_url
|
|
# use file protocol if the path is an absolute path
|
|
if protocol == 'nltk' and os.path.isabs(name):
|
|
protocol = 'file://'
|
|
name = normalize_resource_name(name, False, None)
|
|
elif protocol == 'file':
|
|
protocol = 'file://'
|
|
# name is absolute
|
|
name = normalize_resource_name(name, False, None)
|
|
elif protocol == 'nltk':
|
|
protocol = 'nltk:'
|
|
name = normalize_resource_name(name, True)
|
|
else:
|
|
# handled by urllib
|
|
protocol += '://'
|
|
return ''.join([protocol, name])
|
|
|
|
|
|
def normalize_resource_name(resource_name, allow_relative=True, relative_path=None):
|
|
"""
|
|
:type resource_name: str or unicode
|
|
:param resource_name: The name of the resource to search for.
|
|
Resource names are posix-style relative path names, such as
|
|
``corpora/brown``. Directory names will automatically
|
|
be converted to a platform-appropriate path separator.
|
|
Directory trailing slashes are preserved
|
|
|
|
>>> windows = sys.platform.startswith('win')
|
|
>>> normalize_resource_name('.', True)
|
|
'./'
|
|
>>> normalize_resource_name('./', True)
|
|
'./'
|
|
>>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file'
|
|
True
|
|
>>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file'
|
|
True
|
|
>>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file'
|
|
True
|
|
>>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file'
|
|
True
|
|
>>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file'
|
|
True
|
|
"""
|
|
is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(os.path.sep)
|
|
if sys.platform.startswith('win'):
|
|
resource_name = resource_name.lstrip('/')
|
|
else:
|
|
resource_name = re.sub(r'^/+', '/', resource_name)
|
|
if allow_relative:
|
|
resource_name = os.path.normpath(resource_name)
|
|
else:
|
|
if relative_path is None:
|
|
relative_path = os.curdir
|
|
resource_name = os.path.abspath(
|
|
os.path.join(relative_path, resource_name))
|
|
resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/')
|
|
if sys.platform.startswith('win') and os.path.isabs(resource_name):
|
|
resource_name = '/' + resource_name
|
|
if is_dir and not resource_name.endswith('/'):
|
|
resource_name += '/'
|
|
return resource_name
|
|
|
|
|
|
######################################################################
|
|
# Path Pointers
|
|
######################################################################
|
|
|
|
@add_metaclass(ABCMeta)
|
|
class PathPointer(object):
|
|
"""
|
|
An abstract base class for 'path pointers,' used by NLTK's data
|
|
package to identify specific paths. Two subclasses exist:
|
|
``FileSystemPathPointer`` identifies a file that can be accessed
|
|
directly via a given absolute path. ``ZipFilePathPointer``
|
|
identifies a file contained within a zipfile, that can be accessed
|
|
by reading that zipfile.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def open(self, encoding=None):
|
|
"""
|
|
Return a seekable read-only stream that can be used to read
|
|
the contents of the file identified by this path pointer.
|
|
|
|
:raise IOError: If the path specified by this pointer does
|
|
not contain a readable file.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def file_size(self):
|
|
"""
|
|
Return the size of the file pointed to by this path pointer,
|
|
in bytes.
|
|
|
|
:raise IOError: If the path specified by this pointer does
|
|
not contain a readable file.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def join(self, fileid):
|
|
"""
|
|
Return a new path pointer formed by starting at the path
|
|
identified by this pointer, and then following the relative
|
|
path given by ``fileid``. The path components of ``fileid``
|
|
should be separated by forward slashes, regardless of
|
|
the underlying file system's path seperator character.
|
|
"""
|
|
|
|
|
|
class FileSystemPathPointer(PathPointer, text_type):
|
|
"""
|
|
A path pointer that identifies a file which can be accessed
|
|
directly via a given absolute path.
|
|
"""
|
|
@py3_data
|
|
def __init__(self, _path):
|
|
"""
|
|
Create a new path pointer for the given absolute path.
|
|
|
|
:raise IOError: If the given path does not exist.
|
|
"""
|
|
|
|
_path = os.path.abspath(_path)
|
|
if not os.path.exists(_path):
|
|
raise IOError('No such file or directory: %r' % _path)
|
|
self._path = _path
|
|
|
|
# There's no need to call str.__init__(), since it's a no-op;
|
|
# str does all of its setup work in __new__.
|
|
|
|
@property
|
|
def path(self):
|
|
"""The absolute path identified by this path pointer."""
|
|
return self._path
|
|
|
|
def open(self, encoding=None):
|
|
stream = open(self._path, 'rb')
|
|
if encoding is not None:
|
|
stream = SeekableUnicodeStreamReader(stream, encoding)
|
|
return stream
|
|
|
|
def file_size(self):
|
|
return os.stat(self._path).st_size
|
|
|
|
def join(self, fileid):
|
|
_path = os.path.join(self._path, fileid)
|
|
return FileSystemPathPointer(_path)
|
|
|
|
def __repr__(self):
|
|
# This should be a byte string under Python 2.x;
|
|
# we don't want transliteration here so
|
|
# @python_2_unicode_compatible is not used.
|
|
return str('FileSystemPathPointer(%r)' % self._path)
|
|
|
|
def __str__(self):
|
|
return self._path
|
|
|
|
|
|
class BufferedGzipFile(GzipFile):
|
|
"""
|
|
A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``.
|
|
This allows faster reads and writes of data to and from gzip-compressed
|
|
files at the cost of using more memory.
|
|
|
|
The default buffer size is 2MB.
|
|
|
|
``BufferedGzipFile`` is useful for loading large gzipped pickle objects
|
|
as well as writing large encoded feature files for classifier training.
|
|
"""
|
|
MB = 2 ** 20
|
|
SIZE = 2 * MB
|
|
|
|
@py3_data
|
|
def __init__(self, filename=None, mode=None, compresslevel=9,
|
|
fileobj=None, **kwargs):
|
|
"""
|
|
Return a buffered gzip file object.
|
|
|
|
:param filename: a filesystem path
|
|
:type filename: str
|
|
:param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab',
|
|
'w', or 'wb'
|
|
:type mode: str
|
|
:param compresslevel: The compresslevel argument is an integer from 1
|
|
to 9 controlling the level of compression; 1 is fastest and
|
|
produces the least compression, and 9 is slowest and produces the
|
|
most compression. The default is 9.
|
|
:type compresslevel: int
|
|
:param fileobj: a BytesIO stream to read from instead of a file.
|
|
:type fileobj: BytesIO
|
|
:param size: number of bytes to buffer during calls to read() and write()
|
|
:type size: int
|
|
:rtype: BufferedGzipFile
|
|
"""
|
|
GzipFile.__init__(self, filename, mode, compresslevel, fileobj)
|
|
self._size = kwargs.get('size', self.SIZE)
|
|
self._nltk_buffer = BytesIO()
|
|
# cStringIO does not support len.
|
|
self._len = 0
|
|
|
|
def _reset_buffer(self):
|
|
# For some reason calling BytesIO.truncate() here will lead to
|
|
# inconsistent writes so just set _buffer to a new BytesIO object.
|
|
self._nltk_buffer = BytesIO()
|
|
self._len = 0
|
|
|
|
def _write_buffer(self, data):
|
|
# Simply write to the buffer and increment the buffer size.
|
|
if data is not None:
|
|
self._nltk_buffer.write(data)
|
|
self._len += len(data)
|
|
|
|
def _write_gzip(self, data):
|
|
# Write the current buffer to the GzipFile.
|
|
GzipFile.write(self, self._nltk_buffer.getvalue())
|
|
# Then reset the buffer and write the new data to the buffer.
|
|
self._reset_buffer()
|
|
self._write_buffer(data)
|
|
|
|
def close(self):
|
|
# GzipFile.close() doesn't actuallly close anything.
|
|
if self.mode == GZ_WRITE:
|
|
self._write_gzip(None)
|
|
self._reset_buffer()
|
|
return GzipFile.close(self)
|
|
|
|
def flush(self, lib_mode=FLUSH):
|
|
self._nltk_buffer.flush()
|
|
GzipFile.flush(self, lib_mode)
|
|
|
|
def read(self, size=None):
|
|
if not size:
|
|
size = self._size
|
|
contents = BytesIO()
|
|
while True:
|
|
blocks = GzipFile.read(self, size)
|
|
if not blocks:
|
|
contents.flush()
|
|
break
|
|
contents.write(blocks)
|
|
return contents.getvalue()
|
|
else:
|
|
return GzipFile.read(self, size)
|
|
|
|
def write(self, data, size=-1):
|
|
"""
|
|
:param data: bytes to write to file or buffer
|
|
:type data: bytes
|
|
:param size: buffer at least size bytes before writing to file
|
|
:type size: int
|
|
"""
|
|
if not size:
|
|
size = self._size
|
|
if self._len + len(data) <= size:
|
|
self._write_buffer(data)
|
|
else:
|
|
self._write_gzip(data)
|
|
|
|
|
|
class GzipFileSystemPathPointer(FileSystemPathPointer):
|
|
"""
|
|
A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed
|
|
file located at a given absolute path. ``GzipFileSystemPathPointer`` is
|
|
appropriate for loading large gzip-compressed pickle objects efficiently.
|
|
"""
|
|
|
|
def open(self, encoding=None):
|
|
# Note: In >= Python3.5, GzipFile is already using a
|
|
# buffered reader in the backend which has a variable self._buffer
|
|
# See https://github.com/nltk/nltk/issues/1308
|
|
if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
|
|
stream = BufferedGzipFile(self._path, 'rb')
|
|
else:
|
|
stream = GzipFile(self._path, 'rb')
|
|
if encoding:
|
|
stream = SeekableUnicodeStreamReader(stream, encoding)
|
|
return stream
|
|
|
|
|
|
class ZipFilePathPointer(PathPointer):
|
|
"""
|
|
A path pointer that identifies a file contained within a zipfile,
|
|
which can be accessed by reading that zipfile.
|
|
"""
|
|
@py3_data
|
|
def __init__(self, zipfile, entry=''):
|
|
"""
|
|
Create a new path pointer pointing at the specified entry
|
|
in the given zipfile.
|
|
|
|
:raise IOError: If the given zipfile does not exist, or if it
|
|
does not contain the specified entry.
|
|
"""
|
|
if isinstance(zipfile, string_types):
|
|
zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
|
|
|
|
# Normalize the entry string, it should be relative:
|
|
entry = normalize_resource_name(entry, True, '/').lstrip('/')
|
|
|
|
# Check that the entry exists:
|
|
if entry != '.':
|
|
try:
|
|
zipfile.getinfo(entry)
|
|
except Exception:
|
|
# Sometimes directories aren't explicitly listed in
|
|
# the zip file. So if `entry` is a directory name,
|
|
# then check if the zipfile contains any files that
|
|
# are under the given directory.
|
|
if (entry.endswith('/') and
|
|
[n for n in zipfile.namelist() if n.startswith(entry)]):
|
|
pass # zipfile contains a file in that directory.
|
|
else:
|
|
# Otherwise, complain.
|
|
raise IOError('Zipfile %r does not contain %r' %
|
|
(zipfile.filename, entry))
|
|
self._zipfile = zipfile
|
|
self._entry = entry
|
|
|
|
@property
|
|
def zipfile(self):
|
|
"""
|
|
The zipfile.ZipFile object used to access the zip file
|
|
containing the entry identified by this path pointer.
|
|
"""
|
|
return self._zipfile
|
|
|
|
@property
|
|
def entry(self):
|
|
"""
|
|
The name of the file within zipfile that this path
|
|
pointer points to.
|
|
"""
|
|
return self._entry
|
|
|
|
def open(self, encoding=None):
|
|
data = self._zipfile.read(self._entry)
|
|
stream = BytesIO(data)
|
|
if self._entry.endswith('.gz'):
|
|
# Note: In >= Python3.5, GzipFile is already using a
|
|
# buffered reader in the backend which has a variable self._buffer
|
|
# See https://github.com/nltk/nltk/issues/1308
|
|
if sys.version.startswith('2.7') or sys.version.startswith('3.4'):
|
|
stream = BufferedGzipFile(self._entry, fileobj=stream)
|
|
else:
|
|
stream = GzipFile(self._entry, fileobj=stream)
|
|
elif encoding is not None:
|
|
stream = SeekableUnicodeStreamReader(stream, encoding)
|
|
return stream
|
|
|
|
def file_size(self):
|
|
return self._zipfile.getinfo(self._entry).file_size
|
|
|
|
def join(self, fileid):
|
|
entry = '%s/%s' % (self._entry, fileid)
|
|
return ZipFilePathPointer(self._zipfile, entry)
|
|
|
|
def __repr__(self):
|
|
return str('ZipFilePathPointer(%r, %r)') % (
|
|
self._zipfile.filename, self._entry)
|
|
|
|
def __str__(self):
|
|
return os.path.normpath(os.path.join(self._zipfile.filename,
|
|
self._entry))
|
|
|
|
|
|
######################################################################
|
|
# Access Functions
|
|
######################################################################
|
|
|
|
# Don't use a weak dictionary, because in the common case this
|
|
# causes a lot more reloading that necessary.
|
|
_resource_cache = {}
|
|
"""A dictionary used to cache resources so that they won't
|
|
need to be loaded more than once."""
|
|
|
|
|
|
def find(resource_name, paths=None):
|
|
"""
|
|
Find the given resource by searching through the directories and
|
|
zip files in paths, where a None or empty string specifies an absolute path.
|
|
Returns a corresponding path name. If the given resource is not
|
|
found, raise a ``LookupError``, whose message gives a pointer to
|
|
the installation instructions for the NLTK downloader.
|
|
|
|
Zip File Handling:
|
|
|
|
- If ``resource_name`` contains a component with a ``.zip``
|
|
extension, then it is assumed to be a zipfile; and the
|
|
remaining path components are used to look inside the zipfile.
|
|
|
|
- If any element of ``nltk.data.path`` has a ``.zip`` extension,
|
|
then it is assumed to be a zipfile.
|
|
|
|
- If a given resource name that does not contain any zipfile
|
|
component is not found initially, then ``find()`` will make a
|
|
second attempt to find that resource, by replacing each
|
|
component *p* in the path with *p.zip/p*. For example, this
|
|
allows ``find()`` to map the resource name
|
|
``corpora/chat80/cities.pl`` to a zip file path pointer to
|
|
``corpora/chat80.zip/chat80/cities.pl``.
|
|
|
|
- When using ``find()`` to locate a directory contained in a
|
|
zipfile, the resource name must end with the forward slash
|
|
character. Otherwise, ``find()`` will not locate the
|
|
directory.
|
|
|
|
:type resource_name: str or unicode
|
|
:param resource_name: The name of the resource to search for.
|
|
Resource names are posix-style relative path names, such as
|
|
``corpora/brown``. Directory names will be
|
|
automatically converted to a platform-appropriate path separator.
|
|
:rtype: str
|
|
"""
|
|
resource_name = normalize_resource_name(resource_name, True)
|
|
|
|
# Resolve default paths at runtime in-case the user overrides
|
|
# nltk.data.path
|
|
if paths is None:
|
|
paths = path
|
|
|
|
# Check if the resource name includes a zipfile name
|
|
m = re.match(r'(.*\.zip)/?(.*)$|', resource_name)
|
|
zipfile, zipentry = m.groups()
|
|
|
|
# Check each item in our path
|
|
for path_ in paths:
|
|
# Is the path item a zipfile?
|
|
if path_ and (os.path.isfile(path_) and path_.endswith('.zip')):
|
|
try:
|
|
return ZipFilePathPointer(path_, resource_name)
|
|
except IOError:
|
|
# resource not in zipfile
|
|
continue
|
|
|
|
# Is the path item a directory or is resource_name an absolute path?
|
|
elif not path_ or os.path.isdir(path_):
|
|
if zipfile is None:
|
|
p = os.path.join(path_, url2pathname(resource_name))
|
|
if os.path.exists(p):
|
|
if p.endswith('.gz'):
|
|
return GzipFileSystemPathPointer(p)
|
|
else:
|
|
return FileSystemPathPointer(p)
|
|
else:
|
|
p = os.path.join(path_, url2pathname(zipfile))
|
|
if os.path.exists(p):
|
|
try:
|
|
return ZipFilePathPointer(p, zipentry)
|
|
except IOError:
|
|
# resource not in zipfile
|
|
continue
|
|
|
|
# Fallback: if the path doesn't include a zip file, then try
|
|
# again, assuming that one of the path components is inside a
|
|
# zipfile of the same name.
|
|
if zipfile is None:
|
|
pieces = resource_name.split('/')
|
|
for i in range(len(pieces)):
|
|
modified_name = '/'.join(pieces[:i] +
|
|
[pieces[i] + '.zip'] + pieces[i:])
|
|
try:
|
|
return find(modified_name, paths)
|
|
except LookupError:
|
|
pass
|
|
|
|
# Identify the package (i.e. the .zip file) to download.
|
|
resource_zipname = resource_name.split('/')[1]
|
|
if resource_zipname.endswith('.zip'):
|
|
resource_zipname = resource_zipname.rpartition('.')[0]
|
|
# Display a friendly error message if the resource wasn't found:
|
|
msg = str("Resource \33[93m{resource}\033[0m not found.\n"
|
|
"Please use the NLTK Downloader to obtain the resource:\n\n"
|
|
"\33[31m" # To display red text in terminal.
|
|
">>> import nltk\n"
|
|
">>> nltk.download(\'{resource}\')\n"
|
|
"\033[0m").format(resource=resource_zipname)
|
|
msg = textwrap_indent(msg)
|
|
|
|
msg += '\n Searched in:' + ''.join('\n - %r' % d for d in paths)
|
|
sep = '*' * 70
|
|
resource_not_found = '\n%s\n%s\n%s\n' % (sep, msg, sep)
|
|
raise LookupError(resource_not_found)
|
|
|
|
|
|
def retrieve(resource_url, filename=None, verbose=True):
|
|
"""
|
|
Copy the given resource to a local file. If no filename is
|
|
specified, then use the URL's filename. If there is already a
|
|
file named ``filename``, then raise a ``ValueError``.
|
|
|
|
:type resource_url: str
|
|
:param resource_url: A URL specifying where the resource should be
|
|
loaded from. The default protocol is "nltk:", which searches
|
|
for the file in the the NLTK data package.
|
|
"""
|
|
resource_url = normalize_resource_url(resource_url)
|
|
if filename is None:
|
|
if resource_url.startswith('file:'):
|
|
filename = os.path.split(resource_url)[-1]
|
|
else:
|
|
filename = re.sub(r'(^\w+:)?.*/', '', resource_url)
|
|
if os.path.exists(filename):
|
|
filename = os.path.abspath(filename)
|
|
raise ValueError("File %r already exists!" % filename)
|
|
|
|
if verbose:
|
|
print('Retrieving %r, saving to %r' % (resource_url, filename))
|
|
|
|
# Open the input & output streams.
|
|
infile = _open(resource_url)
|
|
|
|
# Copy infile -> outfile, using 64k blocks.
|
|
with open(filename, "wb") as outfile:
|
|
while True:
|
|
s = infile.read(1024 * 64) # 64k blocks.
|
|
outfile.write(s)
|
|
if not s:
|
|
break
|
|
|
|
infile.close()
|
|
|
|
|
|
#: A dictionary describing the formats that are supported by NLTK's
|
|
#: load() method. Keys are format names, and values are format
|
|
#: descriptions.
|
|
FORMATS = {
|
|
'pickle': "A serialized python object, stored using the pickle module.",
|
|
'json': "A serialized python object, stored using the json module.",
|
|
'yaml': "A serialized python object, stored using the yaml module.",
|
|
'cfg': "A context free grammar.",
|
|
'pcfg': "A probabilistic CFG.",
|
|
'fcfg': "A feature CFG.",
|
|
'fol': "A list of first order logic expressions, parsed with "
|
|
"nltk.sem.logic.Expression.fromstring.",
|
|
'logic': "A list of first order logic expressions, parsed with "
|
|
"nltk.sem.logic.LogicParser. Requires an additional logic_parser "
|
|
"parameter",
|
|
'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
|
|
'raw': "The raw (byte string) contents of a file.",
|
|
'text': "The raw (unicode string) contents of a file. "
|
|
}
|
|
|
|
#: A dictionary mapping from file extensions to format names, used
|
|
#: by load() when format="auto" to decide the format for a
|
|
#: given resource url.
|
|
AUTO_FORMATS = {
|
|
'pickle': 'pickle',
|
|
'json': 'json',
|
|
'yaml': 'yaml',
|
|
'cfg': 'cfg',
|
|
'pcfg': 'pcfg',
|
|
'fcfg': 'fcfg',
|
|
'fol': 'fol',
|
|
'logic': 'logic',
|
|
'val': 'val',
|
|
'txt': 'text',
|
|
'text': 'text',
|
|
}
|
|
|
|
|
|
def load(resource_url, format='auto', cache=True, verbose=False,
|
|
logic_parser=None, fstruct_reader=None, encoding=None):
|
|
"""
|
|
Load a given resource from the NLTK data package. The following
|
|
resource formats are currently supported:
|
|
|
|
- ``pickle``
|
|
- ``json``
|
|
- ``yaml``
|
|
- ``cfg`` (context free grammars)
|
|
- ``pcfg`` (probabilistic CFGs)
|
|
- ``fcfg`` (feature-based CFGs)
|
|
- ``fol`` (formulas of First Order Logic)
|
|
- ``logic`` (Logical formulas to be parsed by the given logic_parser)
|
|
- ``val`` (valuation of First Order Logic model)
|
|
- ``text`` (the file contents as a unicode string)
|
|
- ``raw`` (the raw file contents as a byte string)
|
|
|
|
If no format is specified, ``load()`` will attempt to determine a
|
|
format based on the resource name's file extension. If that
|
|
fails, ``load()`` will raise a ``ValueError`` exception.
|
|
|
|
For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``),
|
|
it tries to decode the raw contents using UTF-8, and if that doesn't
|
|
work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding``
|
|
is specified.
|
|
|
|
:type resource_url: str
|
|
:param resource_url: A URL specifying where the resource should be
|
|
loaded from. The default protocol is "nltk:", which searches
|
|
for the file in the the NLTK data package.
|
|
:type cache: bool
|
|
:param cache: If true, add this resource to a cache. If load()
|
|
finds a resource in its cache, then it will return it from the
|
|
cache rather than loading it. The cache uses weak references,
|
|
so a resource wil automatically be expunged from the cache
|
|
when no more objects are using it.
|
|
:type verbose: bool
|
|
:param verbose: If true, print a message when loading a resource.
|
|
Messages are not displayed when a resource is retrieved from
|
|
the cache.
|
|
:type logic_parser: LogicParser
|
|
:param logic_parser: The parser that will be used to parse logical
|
|
expressions.
|
|
:type fstruct_reader: FeatStructReader
|
|
:param fstruct_reader: The parser that will be used to parse the
|
|
feature structure of an fcfg.
|
|
:type encoding: str
|
|
:param encoding: the encoding of the input; only used for text formats.
|
|
"""
|
|
resource_url = normalize_resource_url(resource_url)
|
|
resource_url = add_py3_data(resource_url)
|
|
|
|
# Determine the format of the resource.
|
|
if format == 'auto':
|
|
resource_url_parts = resource_url.split('.')
|
|
ext = resource_url_parts[-1]
|
|
if ext == 'gz':
|
|
ext = resource_url_parts[-2]
|
|
format = AUTO_FORMATS.get(ext)
|
|
if format is None:
|
|
raise ValueError('Could not determine format for %s based '
|
|
'on its file\nextension; use the "format" '
|
|
'argument to specify the format explicitly.'
|
|
% resource_url)
|
|
|
|
if format not in FORMATS:
|
|
raise ValueError('Unknown format type: %s!' % (format,))
|
|
|
|
# If we've cached the resource, then just return it.
|
|
if cache:
|
|
resource_val = _resource_cache.get((resource_url, format))
|
|
if resource_val is not None:
|
|
if verbose:
|
|
print('<<Using cached copy of %s>>' % (resource_url,))
|
|
return resource_val
|
|
|
|
# Let the user know what's going on.
|
|
if verbose:
|
|
print('<<Loading %s>>' % (resource_url,))
|
|
|
|
# Load the resource.
|
|
opened_resource = _open(resource_url)
|
|
|
|
if format == 'raw':
|
|
resource_val = opened_resource.read()
|
|
elif format == 'pickle':
|
|
resource_val = pickle.load(opened_resource)
|
|
elif format == 'json':
|
|
import json
|
|
from nltk.jsontags import json_tags
|
|
resource_val = json.load(opened_resource)
|
|
tag = None
|
|
if len(resource_val) != 1:
|
|
tag = next(resource_val.keys())
|
|
if tag not in json_tags:
|
|
raise ValueError('Unknown json tag.')
|
|
elif format == 'yaml':
|
|
import yaml
|
|
resource_val = yaml.load(opened_resource)
|
|
else:
|
|
# The resource is a text format.
|
|
binary_data = opened_resource.read()
|
|
if encoding is not None:
|
|
string_data = binary_data.decode(encoding)
|
|
else:
|
|
try:
|
|
string_data = binary_data.decode('utf-8')
|
|
except UnicodeDecodeError:
|
|
string_data = binary_data.decode('latin-1')
|
|
if format == 'text':
|
|
resource_val = string_data
|
|
elif format == 'cfg':
|
|
resource_val = nltk.grammar.CFG.fromstring(
|
|
string_data, encoding=encoding)
|
|
elif format == 'pcfg':
|
|
resource_val = nltk.grammar.PCFG.fromstring(
|
|
string_data, encoding=encoding)
|
|
elif format == 'fcfg':
|
|
resource_val = nltk.grammar.FeatureGrammar.fromstring(
|
|
string_data, logic_parser=logic_parser,
|
|
fstruct_reader=fstruct_reader, encoding=encoding)
|
|
elif format == 'fol':
|
|
resource_val = nltk.sem.read_logic(
|
|
string_data, logic_parser=nltk.sem.logic.LogicParser(),
|
|
encoding=encoding)
|
|
elif format == 'logic':
|
|
resource_val = nltk.sem.read_logic(
|
|
string_data, logic_parser=logic_parser, encoding=encoding)
|
|
elif format == 'val':
|
|
resource_val = nltk.sem.read_valuation(
|
|
string_data, encoding=encoding)
|
|
else:
|
|
raise AssertionError("Internal NLTK error: Format %s isn't "
|
|
"handled by nltk.data.load()" % (format,))
|
|
|
|
opened_resource.close()
|
|
|
|
# If requested, add it to the cache.
|
|
if cache:
|
|
try:
|
|
_resource_cache[(resource_url, format)] = resource_val
|
|
# TODO: add this line
|
|
# print('<<Caching a copy of %s>>' % (resource_url,))
|
|
except TypeError:
|
|
# We can't create weak references to some object types, like
|
|
# strings and tuples. For now, just don't cache them.
|
|
pass
|
|
|
|
return resource_val
|
|
|
|
|
|
def show_cfg(resource_url, escape='##'):
|
|
"""
|
|
Write out a grammar file, ignoring escaped and empty lines.
|
|
|
|
:type resource_url: str
|
|
:param resource_url: A URL specifying where the resource should be
|
|
loaded from. The default protocol is "nltk:", which searches
|
|
for the file in the the NLTK data package.
|
|
:type escape: str
|
|
:param escape: Prepended string that signals lines to be ignored
|
|
"""
|
|
resource_url = normalize_resource_url(resource_url)
|
|
resource_val = load(resource_url, format='text', cache=False)
|
|
lines = resource_val.splitlines()
|
|
for l in lines:
|
|
if l.startswith(escape):
|
|
continue
|
|
if re.match('^$', l):
|
|
continue
|
|
print(l)
|
|
|
|
|
|
def clear_cache():
|
|
"""
|
|
Remove all objects from the resource cache.
|
|
:see: load()
|
|
"""
|
|
_resource_cache.clear()
|
|
|
|
|
|
def _open(resource_url):
|
|
"""
|
|
Helper function that returns an open file object for a resource,
|
|
given its resource URL. If the given resource URL uses the "nltk:"
|
|
protocol, or uses no protocol, then use ``nltk.data.find`` to find
|
|
its path, and open it with the given mode; if the resource URL
|
|
uses the 'file' protocol, then open the file with the given mode;
|
|
otherwise, delegate to ``urllib2.urlopen``.
|
|
|
|
:type resource_url: str
|
|
:param resource_url: A URL specifying where the resource should be
|
|
loaded from. The default protocol is "nltk:", which searches
|
|
for the file in the the NLTK data package.
|
|
"""
|
|
resource_url = normalize_resource_url(resource_url)
|
|
protocol, path_ = split_resource_url(resource_url)
|
|
|
|
if protocol is None or protocol.lower() == 'nltk':
|
|
return find(path_, path + ['']).open()
|
|
elif protocol.lower() == 'file':
|
|
# urllib might not use mode='rb', so handle this one ourselves:
|
|
return find(path_, ['']).open()
|
|
else:
|
|
return urlopen(resource_url)
|
|
|
|
######################################################################
|
|
# Lazy Resource Loader
|
|
######################################################################
|
|
|
|
# We shouldn't apply @python_2_unicode_compatible
|
|
# decorator to LazyLoader, this is resource.__class__ responsibility.
|
|
|
|
|
|
class LazyLoader(object):
|
|
|
|
@py3_data
|
|
def __init__(self, _path):
|
|
self._path = _path
|
|
|
|
def __load(self):
|
|
resource = load(self._path)
|
|
# This is where the magic happens! Transform ourselves into
|
|
# the object by modifying our own __dict__ and __class__ to
|
|
# match that of `resource`.
|
|
self.__dict__ = resource.__dict__
|
|
self.__class__ = resource.__class__
|
|
|
|
def __getattr__(self, attr):
|
|
self.__load()
|
|
# This looks circular, but its not, since __load() changes our
|
|
# __class__ to something new:
|
|
return getattr(self, attr)
|
|
|
|
def __repr__(self):
|
|
self.__load()
|
|
# This looks circular, but its not, since __load() changes our
|
|
# __class__ to something new:
|
|
return repr(self)
|
|
|
|
######################################################################
|
|
# Open-On-Demand ZipFile
|
|
######################################################################
|
|
|
|
|
|
class OpenOnDemandZipFile(zipfile.ZipFile):
|
|
"""
|
|
A subclass of ``zipfile.ZipFile`` that closes its file pointer
|
|
whenever it is not using it; and re-opens it when it needs to read
|
|
data from the zipfile. This is useful for reducing the number of
|
|
open file handles when many zip files are being accessed at once.
|
|
``OpenOnDemandZipFile`` must be constructed from a filename, not a
|
|
file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is
|
|
read-only (i.e. ``write()`` and ``writestr()`` are disabled.
|
|
"""
|
|
@py3_data
|
|
def __init__(self, filename):
|
|
if not isinstance(filename, string_types):
|
|
raise TypeError('ReopenableZipFile filename must be a string')
|
|
zipfile.ZipFile.__init__(self, filename)
|
|
assert self.filename == filename
|
|
self.close()
|
|
# After closing a ZipFile object, the _fileRefCnt needs to be cleared
|
|
# for Python2and3 compatible code.
|
|
self._fileRefCnt = 0
|
|
|
|
def read(self, name):
|
|
assert self.fp is None
|
|
self.fp = open(self.filename, 'rb')
|
|
value = zipfile.ZipFile.read(self, name)
|
|
# Ensure that _fileRefCnt needs to be set for Python2and3 compatible code.
|
|
# Since we only opened one file here, we add 1.
|
|
self._fileRefCnt += 1
|
|
self.close()
|
|
return value
|
|
|
|
def write(self, *args, **kwargs):
|
|
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
|
|
raise NotImplementedError('OpenOnDemandZipfile is read-only')
|
|
|
|
def writestr(self, *args, **kwargs):
|
|
""":raise NotImplementedError: OpenOnDemandZipfile is read-only"""
|
|
raise NotImplementedError('OpenOnDemandZipfile is read-only')
|
|
|
|
def __repr__(self):
|
|
return repr(str('OpenOnDemandZipFile(%r)') % self.filename)
|
|
|
|
######################################################################
|
|
#{ Seekable Unicode Stream Reader
|
|
######################################################################
|
|
|
|
|
|
class SeekableUnicodeStreamReader(object):
|
|
"""
|
|
A stream reader that automatically encodes the source byte stream
|
|
into unicode (like ``codecs.StreamReader``); but still supports the
|
|
``seek()`` and ``tell()`` operations correctly. This is in contrast
|
|
to ``codecs.StreamReader``, which provide *broken* ``seek()`` and
|
|
``tell()`` methods.
|
|
|
|
This class was motivated by ``StreamBackedCorpusView``, which
|
|
makes extensive use of ``seek()`` and ``tell()``, and needs to be
|
|
able to handle unicode-encoded files.
|
|
|
|
Note: this class requires stateless decoders. To my knowledge,
|
|
this shouldn't cause a problem with any of python's builtin
|
|
unicode encodings.
|
|
"""
|
|
DEBUG = True # : If true, then perform extra sanity checks.
|
|
|
|
@py3_data
|
|
def __init__(self, stream, encoding, errors='strict'):
|
|
# Rewind the stream to its beginning.
|
|
stream.seek(0)
|
|
|
|
self.stream = stream
|
|
"""The underlying stream."""
|
|
|
|
self.encoding = encoding
|
|
"""The name of the encoding that should be used to encode the
|
|
underlying stream."""
|
|
|
|
self.errors = errors
|
|
"""The error mode that should be used when decoding data from
|
|
the underlying stream. Can be 'strict', 'ignore', or
|
|
'replace'."""
|
|
|
|
self.decode = codecs.getdecoder(encoding)
|
|
"""The function that is used to decode byte strings into
|
|
unicode strings."""
|
|
|
|
self.bytebuffer = b''
|
|
"""A buffer to use bytes that have been read but have not yet
|
|
been decoded. This is only used when the final bytes from
|
|
a read do not form a complete encoding for a character."""
|
|
|
|
self.linebuffer = None
|
|
"""A buffer used by ``readline()`` to hold characters that have
|
|
been read, but have not yet been returned by ``read()`` or
|
|
``readline()``. This buffer consists of a list of unicode
|
|
strings, where each string corresponds to a single line.
|
|
The final element of the list may or may not be a complete
|
|
line. Note that the existence of a linebuffer makes the
|
|
``tell()`` operation more complex, because it must backtrack
|
|
to the beginning of the buffer to determine the correct
|
|
file position in the underlying byte stream."""
|
|
|
|
self._rewind_checkpoint = 0
|
|
"""The file position at which the most recent read on the
|
|
underlying stream began. This is used, together with
|
|
``_rewind_numchars``, to backtrack to the beginning of
|
|
``linebuffer`` (which is required by ``tell()``)."""
|
|
|
|
self._rewind_numchars = None
|
|
"""The number of characters that have been returned since the
|
|
read that started at ``_rewind_checkpoint``. This is used,
|
|
together with ``_rewind_checkpoint``, to backtrack to the
|
|
beginning of ``linebuffer`` (which is required by ``tell()``)."""
|
|
|
|
self._bom = self._check_bom()
|
|
"""The length of the byte order marker at the beginning of
|
|
the stream (or None for no byte order marker)."""
|
|
|
|
#/////////////////////////////////////////////////////////////////
|
|
# Read methods
|
|
#/////////////////////////////////////////////////////////////////
|
|
|
|
def read(self, size=None):
|
|
"""
|
|
Read up to ``size`` bytes, decode them using this reader's
|
|
encoding, and return the resulting unicode string.
|
|
|
|
:param size: The maximum number of bytes to read. If not
|
|
specified, then read as many bytes as possible.
|
|
:type size: int
|
|
:rtype: unicode
|
|
"""
|
|
chars = self._read(size)
|
|
|
|
# If linebuffer is not empty, then include it in the result
|
|
if self.linebuffer:
|
|
chars = ''.join(self.linebuffer) + chars
|
|
self.linebuffer = None
|
|
self._rewind_numchars = None
|
|
|
|
return chars
|
|
|
|
def readline(self, size=None):
|
|
"""
|
|
Read a line of text, decode it using this reader's encoding,
|
|
and return the resulting unicode string.
|
|
|
|
:param size: The maximum number of bytes to read. If no
|
|
newline is encountered before ``size`` bytes have been read,
|
|
then the returned value may not be a complete line of text.
|
|
:type size: int
|
|
"""
|
|
# If we have a non-empty linebuffer, then return the first
|
|
# line from it. (Note that the last element of linebuffer may
|
|
# not be a complete line; so let _read() deal with it.)
|
|
if self.linebuffer and len(self.linebuffer) > 1:
|
|
line = self.linebuffer.pop(0)
|
|
self._rewind_numchars += len(line)
|
|
return line
|
|
|
|
readsize = size or 72
|
|
chars = ''
|
|
|
|
# If there's a remaining incomplete line in the buffer, add it.
|
|
if self.linebuffer:
|
|
chars += self.linebuffer.pop()
|
|
self.linebuffer = None
|
|
|
|
while True:
|
|
startpos = self.stream.tell() - len(self.bytebuffer)
|
|
new_chars = self._read(readsize)
|
|
|
|
# If we're at a '\r', then read one extra character, since
|
|
# it might be a '\n', to get the proper line ending.
|
|
if new_chars and new_chars.endswith('\r'):
|
|
new_chars += self._read(1)
|
|
|
|
chars += new_chars
|
|
lines = chars.splitlines(True)
|
|
if len(lines) > 1:
|
|
line = lines[0]
|
|
self.linebuffer = lines[1:]
|
|
self._rewind_numchars = (len(new_chars) -
|
|
(len(chars) - len(line)))
|
|
self._rewind_checkpoint = startpos
|
|
break
|
|
elif len(lines) == 1:
|
|
line0withend = lines[0]
|
|
line0withoutend = lines[0].splitlines(False)[0]
|
|
if line0withend != line0withoutend: # complete line
|
|
line = line0withend
|
|
break
|
|
|
|
if not new_chars or size is not None:
|
|
line = chars
|
|
break
|
|
|
|
# Read successively larger blocks of text.
|
|
if readsize < 8000:
|
|
readsize *= 2
|
|
|
|
return line
|
|
|
|
def readlines(self, sizehint=None, keepends=True):
|
|
"""
|
|
Read this file's contents, decode them using this reader's
|
|
encoding, and return it as a list of unicode lines.
|
|
|
|
:rtype: list(unicode)
|
|
:param sizehint: Ignored.
|
|
:param keepends: If false, then strip newlines.
|
|
"""
|
|
return self.read().splitlines(keepends)
|
|
|
|
def next(self):
|
|
"""Return the next decoded line from the underlying stream."""
|
|
line = self.readline()
|
|
if line:
|
|
return line
|
|
else:
|
|
raise StopIteration
|
|
|
|
def __next__(self):
|
|
return self.next()
|
|
|
|
def __iter__(self):
|
|
"""Return self"""
|
|
return self
|
|
|
|
def xreadlines(self):
|
|
"""Return self"""
|
|
return self
|
|
|
|
#/////////////////////////////////////////////////////////////////
|
|
# Pass-through methods & properties
|
|
#/////////////////////////////////////////////////////////////////
|
|
|
|
@property
|
|
def closed(self):
|
|
"""True if the underlying stream is closed."""
|
|
return self.stream.closed
|
|
|
|
@property
|
|
def name(self):
|
|
"""The name of the underlying stream."""
|
|
return self.stream.name
|
|
|
|
@property
|
|
def mode(self):
|
|
"""The mode of the underlying stream."""
|
|
return self.stream.mode
|
|
|
|
def close(self):
|
|
"""
|
|
Close the underlying stream.
|
|
"""
|
|
self.stream.close()
|
|
|
|
#/////////////////////////////////////////////////////////////////
|
|
# Seek and tell
|
|
#/////////////////////////////////////////////////////////////////
|
|
|
|
def seek(self, offset, whence=0):
|
|
"""
|
|
Move the stream to a new file position. If the reader is
|
|
maintaining any buffers, then they will be cleared.
|
|
|
|
:param offset: A byte count offset.
|
|
:param whence: If 0, then the offset is from the start of the file
|
|
(offset should be positive), if 1, then the offset is from the
|
|
current position (offset may be positive or negative); and if 2,
|
|
then the offset is from the end of the file (offset should
|
|
typically be negative).
|
|
"""
|
|
if whence == 1:
|
|
raise ValueError('Relative seek is not supported for '
|
|
'SeekableUnicodeStreamReader -- consider '
|
|
'using char_seek_forward() instead.')
|
|
self.stream.seek(offset, whence)
|
|
self.linebuffer = None
|
|
self.bytebuffer = b''
|
|
self._rewind_numchars = None
|
|
self._rewind_checkpoint = self.stream.tell()
|
|
|
|
def char_seek_forward(self, offset):
|
|
"""
|
|
Move the read pointer forward by ``offset`` characters.
|
|
"""
|
|
if offset < 0:
|
|
raise ValueError('Negative offsets are not supported')
|
|
# Clear all buffers.
|
|
self.seek(self.tell())
|
|
# Perform the seek operation.
|
|
self._char_seek_forward(offset)
|
|
|
|
def _char_seek_forward(self, offset, est_bytes=None):
|
|
"""
|
|
Move the file position forward by ``offset`` characters,
|
|
ignoring all buffers.
|
|
|
|
:param est_bytes: A hint, giving an estimate of the number of
|
|
bytes that will be needed to move forward by ``offset`` chars.
|
|
Defaults to ``offset``.
|
|
"""
|
|
if est_bytes is None:
|
|
est_bytes = offset
|
|
bytes = b''
|
|
|
|
while True:
|
|
# Read in a block of bytes.
|
|
newbytes = self.stream.read(est_bytes - len(bytes))
|
|
bytes += newbytes
|
|
|
|
# Decode the bytes to characters.
|
|
chars, bytes_decoded = self._incr_decode(bytes)
|
|
|
|
# If we got the right number of characters, then seek
|
|
# backwards over any truncated characters, and return.
|
|
if len(chars) == offset:
|
|
self.stream.seek(-len(bytes) + bytes_decoded, 1)
|
|
return
|
|
|
|
# If we went too far, then we can back-up until we get it
|
|
# right, using the bytes we've already read.
|
|
if len(chars) > offset:
|
|
while len(chars) > offset:
|
|
# Assume at least one byte/char.
|
|
est_bytes += offset - len(chars)
|
|
chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
|
|
self.stream.seek(-len(bytes) + bytes_decoded, 1)
|
|
return
|
|
|
|
# Otherwise, we haven't read enough bytes yet; loop again.
|
|
est_bytes += offset - len(chars)
|
|
|
|
def tell(self):
|
|
"""
|
|
Return the current file position on the underlying byte
|
|
stream. If this reader is maintaining any buffers, then the
|
|
returned file position will be the position of the beginning
|
|
of those buffers.
|
|
"""
|
|
# If nothing's buffered, then just return our current filepos:
|
|
if self.linebuffer is None:
|
|
return self.stream.tell() - len(self.bytebuffer)
|
|
|
|
# Otherwise, we'll need to backtrack the filepos until we
|
|
# reach the beginning of the buffer.
|
|
|
|
# Store our original file position, so we can return here.
|
|
orig_filepos = self.stream.tell()
|
|
|
|
# Calculate an estimate of where we think the newline is.
|
|
bytes_read = ((orig_filepos - len(self.bytebuffer)) -
|
|
self._rewind_checkpoint)
|
|
buf_size = sum(len(line) for line in self.linebuffer)
|
|
est_bytes = int((bytes_read * self._rewind_numchars /
|
|
(self._rewind_numchars + buf_size)))
|
|
|
|
self.stream.seek(self._rewind_checkpoint)
|
|
self._char_seek_forward(self._rewind_numchars, est_bytes)
|
|
filepos = self.stream.tell()
|
|
|
|
# Sanity check
|
|
if self.DEBUG:
|
|
self.stream.seek(filepos)
|
|
check1 = self._incr_decode(self.stream.read(50))[0]
|
|
check2 = ''.join(self.linebuffer)
|
|
assert check1.startswith(check2) or check2.startswith(check1)
|
|
|
|
# Return to our original filepos (so we don't have to throw
|
|
# out our buffer.)
|
|
self.stream.seek(orig_filepos)
|
|
|
|
# Return the calculated filepos
|
|
return filepos
|
|
|
|
#/////////////////////////////////////////////////////////////////
|
|
# Helper methods
|
|
#/////////////////////////////////////////////////////////////////
|
|
|
|
def _read(self, size=None):
|
|
"""
|
|
Read up to ``size`` bytes from the underlying stream, decode
|
|
them using this reader's encoding, and return the resulting
|
|
unicode string. ``linebuffer`` is not included in the result.
|
|
"""
|
|
if size == 0:
|
|
return ''
|
|
|
|
# Skip past the byte order marker, if present.
|
|
if self._bom and self.stream.tell() == 0:
|
|
self.stream.read(self._bom)
|
|
|
|
# Read the requested number of bytes.
|
|
if size is None:
|
|
new_bytes = self.stream.read()
|
|
else:
|
|
new_bytes = self.stream.read(size)
|
|
bytes = self.bytebuffer + new_bytes
|
|
|
|
# Decode the bytes into unicode characters
|
|
chars, bytes_decoded = self._incr_decode(bytes)
|
|
|
|
# If we got bytes but couldn't decode any, then read further.
|
|
if (size is not None) and (not chars) and (len(new_bytes) > 0):
|
|
while not chars:
|
|
new_bytes = self.stream.read(1)
|
|
if not new_bytes:
|
|
break # end of file.
|
|
bytes += new_bytes
|
|
chars, bytes_decoded = self._incr_decode(bytes)
|
|
|
|
# Record any bytes we didn't consume.
|
|
self.bytebuffer = bytes[bytes_decoded:]
|
|
|
|
# Return the result
|
|
return chars
|
|
|
|
def _incr_decode(self, bytes):
|
|
"""
|
|
Decode the given byte string into a unicode string, using this
|
|
reader's encoding. If an exception is encountered that
|
|
appears to be caused by a truncation error, then just decode
|
|
the byte string without the bytes that cause the trunctaion
|
|
error.
|
|
|
|
Return a tuple ``(chars, num_consumed)``, where ``chars`` is
|
|
the decoded unicode string, and ``num_consumed`` is the
|
|
number of bytes that were consumed.
|
|
"""
|
|
while True:
|
|
try:
|
|
return self.decode(bytes, 'strict')
|
|
except UnicodeDecodeError as exc:
|
|
# If the exception occurs at the end of the string,
|
|
# then assume that it's a truncation error.
|
|
if exc.end == len(bytes):
|
|
return self.decode(bytes[:exc.start], self.errors)
|
|
|
|
# Otherwise, if we're being strict, then raise it.
|
|
elif self.errors == 'strict':
|
|
raise
|
|
|
|
# If we're not strict, then re-process it with our
|
|
# errors setting. This *may* raise an exception.
|
|
else:
|
|
return self.decode(bytes, self.errors)
|
|
|
|
_BOM_TABLE = {
|
|
'utf8': [(codecs.BOM_UTF8, None)],
|
|
'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'),
|
|
(codecs.BOM_UTF16_BE, 'utf16-be')],
|
|
'utf16le': [(codecs.BOM_UTF16_LE, None)],
|
|
'utf16be': [(codecs.BOM_UTF16_BE, None)],
|
|
'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'),
|
|
(codecs.BOM_UTF32_BE, 'utf32-be')],
|
|
'utf32le': [(codecs.BOM_UTF32_LE, None)],
|
|
'utf32be': [(codecs.BOM_UTF32_BE, None)],
|
|
}
|
|
|
|
def _check_bom(self):
|
|
# Normalize our encoding name
|
|
enc = re.sub('[ -]', '', self.encoding.lower())
|
|
|
|
# Look up our encoding in the BOM table.
|
|
bom_info = self._BOM_TABLE.get(enc)
|
|
|
|
if bom_info:
|
|
# Read a prefix, to check against the BOM(s)
|
|
bytes = self.stream.read(16)
|
|
self.stream.seek(0)
|
|
|
|
# Check for each possible BOM.
|
|
for (bom, new_encoding) in bom_info:
|
|
if bytes.startswith(bom):
|
|
if new_encoding:
|
|
self.encoding = new_encoding
|
|
return len(bom)
|
|
|
|
return None
|
|
|
|
|
|
__all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile',
|
|
'GzipFileSystemPathPointer', 'GzipFileSystemPathPointer',
|
|
'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load',
|
|
'show_cfg', 'clear_cache', 'LazyLoader', 'OpenOnDemandZipFile',
|
|
'GzipFileSystemPathPointer', 'SeekableUnicodeStreamReader']
|