#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2015 Radim Rehurek # # This code is distributed under the terms and conditions # from the MIT License (MIT). """ Utilities for streaming from several file-like data storages: S3 / HDFS / standard filesystem / compressed files..., using a single, Pythonic API. The streaming makes heavy use of generators and pipes, to avoid loading full file contents into memory, allowing work with arbitrarily large files. The main methods are: * `smart_open()`, which opens the given file for reading/writing * `s3_iter_bucket()`, which goes over all keys in an S3 bucket in parallel """ import codecs import collections import logging import os import os.path as P import sys import requests import importlib import io import warnings # Import ``pathlib`` if the builtin ``pathlib`` or the backport ``pathlib2`` are # available. The builtin ``pathlib`` will be imported with higher precedence. for pathlib_module in ('pathlib', 'pathlib2'): try: pathlib = importlib.import_module(pathlib_module) PATHLIB_SUPPORT = True break except ImportError: PATHLIB_SUPPORT = False from boto.compat import BytesIO, urlsplit, six import boto.s3.key import sys from ssl import SSLError from six.moves.urllib import parse as urlparse IS_PY2 = (sys.version_info[0] == 2) logger = logging.getLogger(__name__) if IS_PY2: from bz2file import BZ2File else: from bz2 import BZ2File import gzip # # This module defines a function called smart_open so we cannot use # smart_open.submodule to reference to the submodules. # import smart_open.s3 as smart_open_s3 from smart_open.s3 import iter_bucket as s3_iter_bucket import smart_open.hdfs as smart_open_hdfs import smart_open.webhdfs as smart_open_webhdfs import smart_open.http as smart_open_http SYSTEM_ENCODING = sys.getdefaultencoding() _ISSUE_146_FSTR = ( "You have explicitly specified encoding=%(encoding)s, but smart_open does " "not currently support decoding text via the %(scheme)s scheme. " "Re-open the file without specifying an encoding to suppress this warning." ) _ISSUE_189_URL = 'https://github.com/RaRe-Technologies/smart_open/issues/189' DEFAULT_ERRORS = 'strict' Uri = collections.namedtuple( 'Uri', ( 'scheme', 'uri_path', 'bucket_id', 'key_id', 'port', 'host', 'ordinary_calling_format', 'access_id', 'access_secret', ) ) """Represents all the options that we parse from user input. Some of the above options only make sense for certain protocols, e.g. bucket_id is only for S3. """ # # Set the default values for all Uri fields to be None. This allows us to only # specify the relevant fields when constructing a Uri. # # https://stackoverflow.com/questions/11351032/namedtuple-and-default-values-for-optional-keyword-arguments # Uri.__new__.__defaults__ = (None,) * len(Uri._fields) def smart_open(uri, mode="rb", **kw): """ Open the given S3 / HDFS / filesystem file pointed to by `uri` for reading or writing. The only supported modes for now are 'rb' (read, default) and 'wb' (replace & write). The reads/writes are memory efficient (streamed) and therefore suitable for arbitrarily large files. The `uri` can be either: 1. a URI for the local filesystem (compressed ``.gz`` or ``.bz2`` files handled automatically): `./lines.txt`, `/home/joe/lines.txt.gz`, `file:///home/joe/lines.txt.bz2` 2. a URI for HDFS: `hdfs:///some/path/lines.txt` 3. a URI for Amazon's S3 (can also supply credentials inside the URI): `s3://my_bucket/lines.txt`, `s3://my_aws_key_id:key_secret@my_bucket/lines.txt` 4. an instance of the boto.s3.key.Key class. 5. an instance of the pathlib.Path class. Examples:: >>> # stream lines from http; you can use context managers too: >>> with smart_open.smart_open('http://www.google.com') as fin: ... for line in fin: ... print line >>> # stream lines from S3; you can use context managers too: >>> with smart_open.smart_open('s3://mybucket/mykey.txt') as fin: ... for line in fin: ... print line >>> # you can also use a boto.s3.key.Key instance directly: >>> key = boto.connect_s3().get_bucket("my_bucket").get_key("my_key") >>> with smart_open.smart_open(key) as fin: ... for line in fin: ... print line >>> # stream line-by-line from an HDFS file >>> for line in smart_open.smart_open('hdfs:///user/hadoop/my_file.txt'): ... print line >>> # stream content *into* S3: >>> with smart_open.smart_open('s3://mybucket/mykey.txt', 'wb') as fout: ... for line in ['first line', 'second line', 'third line']: ... fout.write(line + '\n') >>> # stream from/to (compressed) local files: >>> for line in smart_open.smart_open('/home/radim/my_file.txt'): ... print line >>> for line in smart_open.smart_open('/home/radim/my_file.txt.gz'): ... print line >>> with smart_open.smart_open('/home/radim/my_file.txt.gz', 'wb') as fout: ... fout.write("hello world!\n") >>> with smart_open.smart_open('/home/radim/another.txt.bz2', 'wb') as fout: ... fout.write("good bye!\n") >>> # stream from/to (compressed) local files with Expand ~ and ~user constructions: >>> for line in smart_open.smart_open('~/my_file.txt'): ... print line >>> for line in smart_open.smart_open('my_file.txt'): ... print line """ logger.debug('%r', locals()) if not isinstance(mode, six.string_types): raise TypeError('mode should be a string') fobj = _shortcut_open(uri, mode, **kw) if fobj is not None: return fobj # # This is a work-around for the problem described in Issue #144. # If the user has explicitly specified an encoding, then assume they want # us to open the destination in text mode, instead of the default binary. # # If we change the default mode to be text, and match the normal behavior # of Py2 and 3, then the above assumption will be unnecessary. # if kw.get('encoding') is not None and 'b' in mode: mode = mode.replace('b', '') # Support opening ``pathlib.Path`` objects by casting them to strings. if PATHLIB_SUPPORT and isinstance(uri, pathlib.Path): uri = str(uri) # # Our API is very liberal with keyword arguments, making it a bit hard to # manage them. Capture the keyword arguments we'll be using in this # function in advance to reduce the confusion in downstream functions. # # explicit_encoding is what we've been explicitly told to use. encoding is # what we'll actually end up using. The two may be different if the user # didn't actually specify the encoding. # ignore_extension = kw.pop('ignore_extension', False) explicit_encoding = kw.get('encoding', None) encoding = kw.pop('encoding', SYSTEM_ENCODING) # # This is how we get from the filename to the end result. Decompression is # optional, but it always accepts bytes and returns bytes. # # Decoding is also optional, accepts bytes and returns text. The diagram # below is for reading, for writing, the flow is from right to left, but # the code is identical. # # open as binary decompress? decode? # filename ---------------> bytes -------------> bytes ---------> text # binary decompressed decode # try: binary_mode = {'r': 'rb', 'r+': 'rb+', 'w': 'wb', 'w+': 'wb+', 'a': 'ab', 'a+': 'ab+'}[mode] except KeyError: binary_mode = mode binary, filename = _open_binary_stream(uri, binary_mode, **kw) if ignore_extension: decompressed = binary else: decompressed = _compression_wrapper(binary, filename, mode) if 'b' not in mode or explicit_encoding is not None: errors = kw.pop('errors', 'strict') decoded = _encoding_wrapper(decompressed, mode, encoding=encoding, errors=errors) else: decoded = decompressed return decoded def _shortcut_open(uri, mode, **kw): """Try to open the URI using the standard library io.open function. This can be much faster than the alternative of opening in binary mode and then decoding. This is only possible under the following conditions: 1. Opening a local file 2. Ignore extension is set to True If it is not possible to use the built-in open for the specified URI, returns None. :param str uri: A string indicating what to open. :param str mode: The mode to pass to the open function. :param dict kw: :returns: The opened file :rtype: file """ if not isinstance(uri, six.string_types): return None parsed_uri = _parse_uri(uri) if parsed_uri.scheme != 'file': return None _, extension = P.splitext(parsed_uri.uri_path) ignore_extension = kw.get('ignore_extension', False) if extension in ('.gz', '.bz2') and not ignore_extension: return None open_kwargs = {} errors = kw.get('errors') if errors is not None: open_kwargs['errors'] = errors encoding = kw.get('encoding') if encoding is not None: open_kwargs['encoding'] = encoding mode = mode.replace('b', '') return io.open(parsed_uri.uri_path, mode, **open_kwargs) def _open_binary_stream(uri, mode, **kw): """Open an arbitrary URI in the specified binary mode. Not all modes are supported for all protocols. :arg uri: The URI to open. May be a string, or something else. :arg str mode: The mode to open with. Must be rb, wb or ab. :arg kw: TODO: document this. :returns: A file object and the filename :rtype: tuple """ if mode not in ('rb', 'rb+', 'wb', 'wb+', 'ab', 'ab+'): # # This should really be a ValueError, but for the sake of compatibility # with older versions, which raise NotImplementedError, we do the same. # raise NotImplementedError('unsupported mode: %r' % mode) if isinstance(uri, six.string_types): # this method just routes the request to classes handling the specific storage # schemes, depending on the URI protocol in `uri` filename = uri.split('/')[-1] parsed_uri = _parse_uri(uri) unsupported = "%r mode not supported for %r scheme" % (mode, parsed_uri.scheme) if parsed_uri.scheme in ("file", ): # local files -- both read & write supported # compression, if any, is determined by the filename extension (.gz, .bz2) fobj = io.open(parsed_uri.uri_path, mode) return fobj, filename elif parsed_uri.scheme in ("s3", "s3n", 's3u'): return _s3_open_uri(parsed_uri, mode, **kw), filename elif parsed_uri.scheme in ("hdfs", ): if mode == 'rb': return smart_open_hdfs.CliRawInputBase(parsed_uri.uri_path), filename elif mode == 'wb': return smart_open_hdfs.CliRawOutputBase(parsed_uri.uri_path), filename else: raise NotImplementedError(unsupported) elif parsed_uri.scheme in ("webhdfs", ): if mode == 'rb': fobj = smart_open_webhdfs.BufferedInputBase(parsed_uri.uri_path, **kw) elif mode == 'wb': fobj = smart_open_webhdfs.BufferedOutputBase(parsed_uri.uri_path, **kw) else: raise NotImplementedError(unsupported) return fobj, filename elif parsed_uri.scheme.startswith('http'): # # The URI may contain a query string and fragments, which interfere # with out compressed/uncompressed estimation. # filename = P.basename(urlparse.urlparse(uri).path) if mode == 'rb': return smart_open_http.BufferedInputBase(uri, **kw), filename else: raise NotImplementedError(unsupported) else: raise NotImplementedError("scheme %r is not supported", parsed_uri.scheme) elif isinstance(uri, boto.s3.key.Key): logger.debug('%r', locals()) # # TODO: handle boto3 keys as well # host = kw.pop('host', None) if host is not None: kw['endpoint_url'] = 'http://' + host return smart_open_s3.open(uri.bucket.name, uri.name, mode, **kw), uri.name elif hasattr(uri, 'read'): # simply pass-through if already a file-like filename = '/tmp/unknown' return uri, filename else: raise TypeError('don\'t know how to handle uri %s' % repr(uri)) def _s3_open_uri(parsed_uri, mode, **kwargs): logger.debug('s3_open_uri: %r', locals()) if mode in ('r', 'w'): raise ValueError('this function can only open binary streams. ' 'Use smart_open.smart_open() to open text streams.') elif mode not in ('rb', 'wb'): raise NotImplementedError('unsupported mode: %r', mode) if parsed_uri.access_id is not None: kwargs['aws_access_key_id'] = parsed_uri.access_id if parsed_uri.access_secret is not None: kwargs['aws_secret_access_key'] = parsed_uri.access_secret # Get an S3 host. It is required for sigv4 operations. host = kwargs.pop('host', None) if host is not None: kwargs['endpoint_url'] = 'http://' + host return smart_open_s3.open(parsed_uri.bucket_id, parsed_uri.key_id, mode, **kwargs) def _parse_uri(uri_as_string): """ Parse the given URI from a string. Supported URI schemes are "file", "s3", "s3n", "s3u" and "hdfs". * s3 and s3n are treated the same way. * s3u is s3 but without SSL. Valid URI examples:: * s3://my_bucket/my_key * s3://my_key:my_secret@my_bucket/my_key * s3://my_key:my_secret@my_server:my_port@my_bucket/my_key * hdfs:///path/file * hdfs://path/file * webhdfs://host:port/path/file * ./local/path/file * ~/local/path/file * local/path/file * ./local/path/file.gz * file:///home/user/file * file:///home/user/file.bz2 """ if os.name == 'nt': # urlsplit doesn't work on Windows -- it parses the drive as the scheme... if '://' not in uri_as_string: # no protocol given => assume a local file uri_as_string = 'file://' + uri_as_string parsed_uri = urlsplit(uri_as_string, allow_fragments=False) if parsed_uri.scheme == "hdfs": return _parse_uri_hdfs(parsed_uri) elif parsed_uri.scheme == "webhdfs": return _parse_uri_webhdfs(parsed_uri) elif parsed_uri.scheme in ("s3", "s3n", "s3u"): return _parse_uri_s3x(parsed_uri) elif parsed_uri.scheme in ('file', '', None): return _parse_uri_file(parsed_uri) elif parsed_uri.scheme.startswith('http'): return Uri(scheme=parsed_uri.scheme, uri_path=uri_as_string) else: raise NotImplementedError( "unknown URI scheme %r in %r" % (parsed_uri.scheme, uri_as_string) ) def _parse_uri_hdfs(parsed_uri): assert parsed_uri.scheme == 'hdfs' uri_path = parsed_uri.netloc + parsed_uri.path uri_path = "/" + uri_path.lstrip("/") if not uri_path: raise RuntimeError("invalid HDFS URI: %s" % uri) return Uri(scheme='hdfs', uri_path=uri_path) def _parse_uri_webhdfs(parsed_uri): assert parsed_uri.scheme == 'webhdfs' uri_path = parsed_uri.netloc + "/webhdfs/v1" + parsed_uri.path if parsed_uri.query: uri_path += "?" + parsed_uri.query if not uri_path: raise RuntimeError("invalid WebHDFS URI: %s" % uri) return Uri(scheme='webhdfs', uri_path=uri_path) def _parse_uri_s3x(parsed_uri): assert parsed_uri.scheme in ("s3", "s3n", "s3u") bucket_id = (parsed_uri.netloc + parsed_uri.path).split('@') key_id = None port = 443 host = boto.config.get('s3', 'host', 's3.amazonaws.com') ordinary_calling_format = False if len(bucket_id) == 1: # URI without credentials: s3://bucket/object bucket_id, key_id = bucket_id[0].split('/', 1) # "None" credentials are interpreted as "look for credentials in other locations" by boto access_id, access_secret = None, None elif len(bucket_id) == 2 and len(bucket_id[0].split(':')) == 2: # URI in full format: s3://key:secret@bucket/object # access key id: [A-Z0-9]{20} # secret access key: [A-Za-z0-9/+=]{40} acc, bucket_id = bucket_id access_id, access_secret = acc.split(':') bucket_id, key_id = bucket_id.split('/', 1) elif len(bucket_id) == 3 and len(bucket_id[0].split(':')) == 2: # or URI in extended format: s3://key:secret@server[:port]@bucket/object acc, server, bucket_id = bucket_id access_id, access_secret = acc.split(':') bucket_id, key_id = bucket_id.split('/', 1) server = server.split(':') ordinary_calling_format = True host = server[0] if len(server) == 2: port = int(server[1]) else: # more than 2 '@' means invalid uri # Bucket names must be at least 3 and no more than 63 characters long. # Bucket names must be a series of one or more labels. # Adjacent labels are separated by a single period (.). # Bucket names can contain lowercase letters, numbers, and hyphens. # Each label must start and end with a lowercase letter or a number. raise RuntimeError("invalid S3 URI: %s" % str(parsed_uri)) return Uri( scheme=parsed_uri.scheme, bucket_id=bucket_id, key_id=key_id, port=port, host=host, ordinary_calling_format=ordinary_calling_format, access_id=access_id, access_secret=access_secret ) def _parse_uri_file(parsed_uri): assert parsed_uri.scheme in (None, '', 'file') uri_path = parsed_uri.netloc + parsed_uri.path # '~/tmp' may be expanded to '/Users/username/tmp' uri_path = os.path.expanduser(uri_path) if not uri_path: raise RuntimeError("invalid file URI: %s" % uri) return Uri(scheme='file', uri_path=uri_path) def _need_to_buffer(file_obj, mode, ext): """Returns True if we need to buffer the whole file in memory in order to proceed.""" try: is_seekable = file_obj.seekable() except AttributeError: # # Under Py2, built-in file objects returned by open do not have # .seekable, but have a .seek method instead. # is_seekable = hasattr(file_obj, 'seek') return six.PY2 and mode.startswith('r') and ext in ('.gz', '.bz2') and not is_seekable def _compression_wrapper(file_obj, filename, mode): """ This function will wrap the file_obj with an appropriate [de]compression mechanism based on the extension of the filename. file_obj must either be a filehandle object, or a class which behaves like one. If the filename extension isn't recognized, will simply return the original file_obj. """ _, ext = os.path.splitext(filename) if _need_to_buffer(file_obj, mode, ext): warnings.warn('streaming gzip support unavailable, see %s' % _ISSUE_189_URL) file_obj = io.BytesIO(file_obj.read()) if ext == '.bz2': return BZ2File(file_obj, mode) elif ext == '.gz': return gzip.GzipFile(fileobj=file_obj, mode=mode) else: return file_obj def _encoding_wrapper(fileobj, mode, encoding=None, errors=DEFAULT_ERRORS): """Decode bytes into text, if necessary. If mode specifies binary access, does nothing, unless the encoding is specified. A non-null encoding implies text mode. :arg fileobj: must quack like a filehandle object. :arg str mode: is the mode which was originally requested by the user. :arg str encoding: The text encoding to use. If mode is binary, overrides mode. :arg str errors: The method to use when handling encoding/decoding errors. :returns: a file object """ logger.debug('encoding_wrapper: %r', locals()) # # If the mode is binary, but the user specified an encoding, assume they # want text. If we don't make this assumption, ignore the encoding and # return bytes, smart_open behavior will diverge from the built-in open: # # open(filename, encoding='utf-8') returns a text stream in Py3 # smart_open(filename, encoding='utf-8') would return a byte stream # without our assumption, because the default mode is rb. # if 'b' in mode and encoding is None: return fileobj if encoding is None: encoding = SYSTEM_ENCODING if mode[0] == 'r': decoder = codecs.getreader(encoding) else: decoder = codecs.getwriter(encoding) return decoder(fileobj, errors=errors)