#!/usr/bin/env python
|
|||
|
# -*- coding: utf-8 -*-
|
|||
|
#
|
|||
|
# Copyright (C) 2015 Radim Rehurek <me@radimrehurek.com>
|
|||
|
#
|
|||
|
# This code is distributed under the terms and conditions
|
|||
|
# from the MIT License (MIT).
|
|||
|
|
|||
|
import io
|
|||
|
import unittest
|
|||
|
import logging
|
|||
|
import tempfile
|
|||
|
import os
|
|||
|
import sys
|
|||
|
import hashlib
|
|||
|
|
|||
|
import boto3
|
|||
|
import mock
|
|||
|
from moto import mock_s3
|
|||
|
import responses
|
|||
|
import gzip
|
|||
|
import six
|
|||
|
|
|||
|
import smart_open
|
|||
|
from smart_open import smart_open_lib
|
|||
|
|
|||
|
logger = logging.getLogger(__name__)
|
|||
|
|
|||
|
PY2 = sys.version_info[0] == 2
|
|||
|
CURR_DIR = os.path.abspath(os.path.dirname(__file__))
|
|||
|
|
|||
|
|
|||
|
class ParseUriTest(unittest.TestCase):
    """Unit tests for URI parsing via ``smart_open_lib._parse_uri``."""

    def _check_s3(self, uri, bucket, key, access_id, access_secret):
        # Single checkpoint for the five attributes every S3 parse case verifies.
        self.assertEqual(uri.scheme, "s3")
        self.assertEqual(uri.bucket_id, bucket)
        self.assertEqual(uri.key_id, key)
        self.assertEqual(uri.access_id, access_id)
        self.assertEqual(uri.access_secret, access_secret)

    def test_scheme(self):
        """Do URIs schemes parse correctly?"""
        # every scheme smart_open claims to support must round-trip
        for scheme in ("s3", "s3n", "hdfs", "file", "http", "https"):
            uri = smart_open_lib._parse_uri("%s://mybucket/mykey" % scheme)
            self.assertEqual(uri.scheme, scheme)

        # a scheme smart_open does not know about must raise
        self.assertRaises(
            NotImplementedError, smart_open_lib._parse_uri, "foobar://mybucket/mykey"
        )

        # no scheme at all falls back to the local filesystem
        self.assertEqual(smart_open_lib._parse_uri("blah blah").scheme, "file")

    def test_s3_uri(self):
        """Do S3 URIs parse correctly?"""
        # plain bucket/key, no credentials embedded
        self._check_s3(
            smart_open_lib._parse_uri("s3://mybucket/mykey"),
            "mybucket", "mykey", None, None)

        # key containing a slash stays intact
        self._check_s3(
            smart_open_lib._parse_uri("s3://mybucket/mydir/mykey"),
            "mybucket", "mydir/mykey", None, None)

        # credentials embedded in the URI; the secret may contain '/' and '-'
        self._check_s3(
            smart_open_lib._parse_uri("s3://ACCESSID456:acces/sse_cr-et@mybucket/mykey"),
            "mybucket", "mykey", "ACCESSID456", "acces/sse_cr-et")

        self._check_s3(
            smart_open_lib._parse_uri("s3://accessid:access/secret@mybucket/mykey"),
            "mybucket", "mykey", "accessid", "access/secret")

        # incorrect uri - only two '@' in uri are allowed
        self.assertRaises(
            RuntimeError, smart_open_lib._parse_uri,
            "s3://access_id@access_secret@mybucket@port/mykey")

    def test_webhdfs_uri(self):
        """Do webhdfs URIs parse correctly"""
        # without a query string the REST prefix is spliced into the path
        uri = smart_open_lib._parse_uri("webhdfs://host:port/path/file")
        self.assertEqual(uri.scheme, "webhdfs")
        self.assertEqual(uri.uri_path, "host:port/webhdfs/v1/path/file")

        # a query string must be carried over verbatim
        uri = smart_open_lib._parse_uri("webhdfs://host:port/path/file?query_part_1&query_part2")
        self.assertEqual(uri.scheme, "webhdfs")
        self.assertEqual(uri.uri_path, "host:port/webhdfs/v1/path/file?query_part_1&query_part2")
|||
|
|
|||
|
class SmartOpenHttpTest(unittest.TestCase):
    """
    Test reading from HTTP connections in various ways.

    All HTTP traffic is intercepted by the `responses` library, so no real
    network access takes place.
    """
    @responses.activate
    def test_http_read(self):
        """Does http read method work correctly"""
        # serve a small fixed body; read() must return all of it as bytes
        responses.add(responses.GET, "http://127.0.0.1/index.html",
                      body='line1\nline2', stream=True)
        smart_open_object = smart_open.smart_open("http://127.0.0.1/index.html")
        self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2")

    @responses.activate
    def test_https_readline(self):
        """Does https readline method work correctly"""
        responses.add(responses.GET, "https://127.0.0.1/index.html",
                      body='line1\nline2', stream=True)
        smart_open_object = smart_open.smart_open("https://127.0.0.1/index.html")
        # readline() stops at (and includes) the first newline
        self.assertEqual(smart_open_object.readline().decode("utf-8"), "line1\n")

    @responses.activate
    def test_http_pass(self):
        """Does http authentication work correctly"""
        responses.add(responses.GET, "http://127.0.0.1/index.html",
                      body='line1\nline2', stream=True)
        # user/password keywords should turn into a Basic auth header
        _ = smart_open.smart_open("http://127.0.0.1/index.html", user='me', password='pass')
        self.assertEqual(len(responses.calls), 1)
        actual_request = responses.calls[0].request
        self.assertTrue('Authorization' in actual_request.headers)
        self.assertTrue(actual_request.headers['Authorization'].startswith('Basic '))

    @responses.activate
    @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet')
    def test_http_gz(self):
        """Can open gzip via http?"""
        # use a local gzip fixture as the fake HTTP response body
        fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
        with open(fpath, 'rb') as infile:
            data = infile.read()

        with gzip.GzipFile(fpath) as fin:
            expected_hash = hashlib.md5(fin.read()).hexdigest()

        responses.add(responses.GET, "http://127.0.0.1/data.gz", body=data, stream=True)
        # a query string after '.gz' must not confuse extension detection
        smart_open_object = smart_open.smart_open("http://127.0.0.1/data.gz?some_param=some_val")

        m = hashlib.md5(smart_open_object.read())
        # decompress the gzip and get the same md5 hash
        self.assertEqual(m.hexdigest(), expected_hash)

    @responses.activate
    @unittest.skipIf(six.PY2, 'gzip support for Py2 is not implemented yet')
    def test_http_gz_noquerystring(self):
        """Can open gzip via http?"""
        fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
        with open(fpath, 'rb') as infile:
            data = infile.read()

        with gzip.GzipFile(fpath) as fin:
            expected_hash = hashlib.md5(fin.read()).hexdigest()

        responses.add(responses.GET, "http://127.0.0.1/data.gz", body=data, stream=True)
        # same as test_http_gz but the URL has no query string
        smart_open_object = smart_open.smart_open("http://127.0.0.1/data.gz")

        m = hashlib.md5(smart_open_object.read())
        # decompress the gzip and get the same md5 hash
        self.assertEqual(m.hexdigest(), expected_hash)

    @responses.activate
    def test_http_bz2(self):
        """Can open bz2 via http?"""
        test_string = b'Hello World Compressed.'
        #
        # TODO: why are these tests writing to temporary files? We can do the
        # bz2 compression in memory.
        #
        with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile:
            test_file = infile.name

        # the '.bz2' suffix makes smart_open compress on write
        with smart_open.smart_open(test_file, 'wb') as outfile:
            outfile.write(test_string)

        # read back the raw (compressed) bytes to serve them over HTTP
        with open(test_file, 'rb') as infile:
            compressed_data = infile.read()

        if os.path.isfile(test_file):
            os.unlink(test_file)

        responses.add(responses.GET, "http://127.0.0.1/data.bz2",
                      body=compressed_data, stream=True)
        smart_open_object = smart_open.smart_open("http://127.0.0.1/data.bz2")

        # the '.bz2' URL suffix must trigger transparent decompression on read
        self.assertEqual(smart_open_object.read(), test_string)
|
|||
|
|
|||
|
class SmartOpenReadTest(unittest.TestCase):
    """
    Test reading from files under various schemes.

    S3 access is faked with moto's ``mock_s3``; HTTP/webhdfs traffic with the
    `responses` library; local-file and HDFS cases with ``mock.patch``.
    """

    def test_shortcut(self):
        """Opening a local text file in 'r' mode should delegate to io.open."""
        fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
        with mock.patch('io.open') as mock_open:
            smart_open.smart_open(fpath, 'r').read()
        mock_open.assert_called_with(fpath, 'r')

    def test_open_with_keywords(self):
        """This test captures Issue #142."""
        fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')
        with open(fpath, 'rb') as fin:
            expected = fin.read().decode('cp852')
        # mode is omitted on purpose: the encoding keyword alone must work
        with smart_open.smart_open(fpath, encoding='cp852') as fin:
            actual = fin.read()
        self.assertEqual(expected, actual)

    def test_open_with_keywords_explicit_r(self):
        """The encoding keyword must also work alongside an explicit mode."""
        fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')
        with open(fpath, 'rb') as fin:
            expected = fin.read().decode('cp852')
        with smart_open.smart_open(fpath, mode='r', encoding='cp852') as fin:
            actual = fin.read()
        self.assertEqual(expected, actual)

    @unittest.skipUnless(
        smart_open_lib.PATHLIB_SUPPORT,
        "do not test pathlib support if pathlib or backport are not available")
    def test_open_and_read_pathlib_path(self):
        """If ``pathlib.Path`` is available we should be able to open and read."""
        from smart_open.smart_open_lib import pathlib

        fpath = os.path.join(CURR_DIR, 'test_data/cp852.tsv.txt')
        with open(fpath, 'rb') as fin:
            expected = fin.read().decode('cp852')
        with smart_open.smart_open(pathlib.Path(fpath), mode='r', encoding='cp852') as fin:
            actual = fin.read()
        self.assertEqual(expected, actual)

    @mock_s3
    def test_read_never_returns_none(self):
        """read should never return None."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        test_string = u"ветер по морю гуляет..."
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(test_string.encode('utf8'))

        r = smart_open.smart_open("s3://mybucket/mykey", "rb")
        self.assertEqual(r.read(), test_string.encode("utf-8"))
        # repeated reads past EOF must keep returning b"", never None
        self.assertEqual(r.read(), b"")
        self.assertEqual(r.read(), b"")

    @mock_s3
    def test_readline(self):
        """Does readline() return the correct file content?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        test_string = u"hello žluťoučký world!\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(test_string)

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
        # the first line, newline included
        self.assertEqual(reader.readline(), u"hello žluťoučký world!\n".encode("utf-8"))

    @mock_s3
    def test_readline_iter(self):
        """Does __iter__ return the correct file content?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        lines = [u"всем привет!\n", u"что нового?"]
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write("".join(lines).encode("utf-8"))

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

        actual_lines = [l.decode("utf-8") for l in reader]
        self.assertEqual(2, len(actual_lines))
        self.assertEqual(lines[0], actual_lines[0])
        self.assertEqual(lines[1], actual_lines[1])

    @mock_s3
    def test_readline_eof(self):
        """Does readline() return empty string on EOF?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        # create an empty key
        with smart_open.smart_open("s3://mybucket/mykey", "wb"):
            pass

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

        # readline at EOF must keep returning b""
        self.assertEqual(reader.readline(), b"")
        self.assertEqual(reader.readline(), b"")
        self.assertEqual(reader.readline(), b"")

    @mock_s3
    def test_s3_iter_lines(self):
        """Does s3_iter_lines give correct content?"""
        # create fake bucket and fake key
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        test_string = u"hello žluťoučký world!\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fin:
            fin.write(test_string)

        # call s3_iter_lines and check output
        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
        output = list(reader)
        self.assertEqual(b''.join(output), test_string)

    # TODO: add more complex test for file://
    @mock.patch('io.open')
    def test_file(self, mock_smart_open):
        """Is file:// line iterator called correctly?"""
        prefix = "file://"
        full_path = '/tmp/test.txt'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode)

        # '#' characters in the path must not be treated as a URL fragment
        full_path = '/tmp/test#hash##more.txt'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode)

        # bare path (no scheme) containing '#'
        full_path = 'aa#aa'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode)

        # '~' must be expanded before reaching io.open
        short_path = "~/tmp/test.txt"
        full_path = os.path.expanduser(short_path)

        smart_open_object = smart_open.smart_open(prefix+short_path, read_mode, errors='strict')
        smart_open_object.__iter__()
        # called with the correct expanded path?
        mock_smart_open.assert_called_with(full_path, read_mode, errors='strict')

    # couldn't find any project for mocking up HDFS data
    # TODO: we want to test also a content of the files, not just fnc call params
    @mock.patch('smart_open.hdfs.subprocess')
    def test_hdfs(self, mock_subprocess):
        """Is HDFS line iterator called correctly?"""
        mock_subprocess.PIPE.return_value = "test"
        smart_open_object = smart_open.smart_open("hdfs:///tmp/test.txt")
        smart_open_object.__iter__()
        # called with the correct params?
        mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE)

        # second possibility of schema
        smart_open_object = smart_open.smart_open("hdfs://tmp/test.txt")
        smart_open_object.__iter__()
        mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE)

    @responses.activate
    def test_webhdfs(self):
        """Is webhdfs line iterator called correctly"""
        responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file",
                      body='line1\nline2', stream=True)
        smart_open_object = smart_open.smart_open("webhdfs://127.0.0.1:8440/path/file")
        iterator = iter(smart_open_object)
        self.assertEqual(next(iterator).decode("utf-8"), "line1\n")
        self.assertEqual(next(iterator).decode("utf-8"), "line2")

    @responses.activate
    def test_webhdfs_encoding(self):
        """Does the encoding keyword decode webhdfs content correctly?"""
        input_url = "webhdfs://127.0.0.1:8440/path/file"
        actual_url = 'http://127.0.0.1:8440/webhdfs/v1/path/file'
        text = u'не для меня прийдёт весна, не для меня дон разольётся'
        body = text.encode('utf-8')
        responses.add(responses.GET, actual_url, body=body, stream=True)

        actual = smart_open.smart_open(input_url, encoding='utf-8').read()
        self.assertEqual(text, actual)

    @responses.activate
    def test_webhdfs_read(self):
        """Does webhdfs read method work correctly"""
        responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file",
                      body='line1\nline2', stream=True)
        smart_open_object = smart_open.smart_open("webhdfs://127.0.0.1:8440/path/file")
        self.assertEqual(smart_open_object.read().decode("utf-8"), "line1\nline2")

    @mock_s3
    def test_s3_iter_moto(self):
        """Are S3 files iterated over correctly?"""
        # a list of strings to test with
        expected = [b"*" * 5 * 1024**2] + [b'0123456789'] * 1024 + [b"test"]

        # create fake bucket and fake key
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        with smart_open.smart_open("s3://mybucket/mykey", "wb", s3_min_part_size=5 * 1024**2) as fout:
            # write a single huge line (=full multipart upload)
            fout.write(expected[0] + b'\n')

            # write lots of small lines
            for line in expected[1:-1]:
                fout.write(line + b'\n')

            # ...and write the last line too, no newline at the end
            fout.write(expected[-1])

        # connect to fake s3 and read from the fake key we filled above
        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        output = [line.rstrip(b'\n') for line in smart_open_object]
        self.assertEqual(output, expected)

        # same thing but using a context manager
        with smart_open.smart_open("s3://mybucket/mykey") as smart_open_object:
            output = [line.rstrip(b'\n') for line in smart_open_object]
            self.assertEqual(output, expected)

    @mock_s3
    def test_s3_read_moto(self):
        """Are S3 files read correctly?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        # write some bogus key so we can check it below
        content = u"hello wořld\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(content)

        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        self.assertEqual(content[:6], smart_open_object.read(6))
        self.assertEqual(content[6:14], smart_open_object.read(8))  # ř is 2 bytes

        self.assertEqual(content[14:], smart_open_object.read())  # read the rest

    @unittest.skip('seek functionality for S3 currently disabled because of Issue #152')
    @mock_s3
    def test_s3_seek_moto(self):
        """Does seeking in S3 files work correctly?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        # write some bogus key so we can check it below
        content = u"hello wořld\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(content)

        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        self.assertEqual(content[:6], smart_open_object.read(6))
        self.assertEqual(content[6:14], smart_open_object.read(8))  # ř is 2 bytes

        smart_open_object.seek(0)
        self.assertEqual(content, smart_open_object.read())  # no size given => read whole file

        smart_open_object.seek(0)
        self.assertEqual(content, smart_open_object.read(-1))  # same thing
|
|||
|
|
|||
|
class SmartOpenS3KwargsTest(unittest.TestCase):
    """Verify that S3-related keyword arguments reach boto3 unchanged."""

    @mock.patch('boto3.Session')
    def test_no_kwargs(self, mock_session):
        # without credentials or kwargs: default profile, bare resource('s3')
        smart_open.smart_open('s3://mybucket/mykey')
        mock_session.assert_called_with(profile_name=None)
        mock_session.return_value.resource.assert_called_with('s3')

    @mock.patch('boto3.Session')
    def test_credentials(self, mock_session):
        # credentials embedded in the URI are forwarded to resource('s3')
        smart_open.smart_open('s3://access_id:access_secret@mybucket/mykey')
        mock_session.assert_called_with(profile_name=None)
        mock_session.return_value.resource.assert_called_with(
            's3', aws_access_key_id='access_id', aws_secret_access_key='access_secret'
        )

    @mock.patch('boto3.Session')
    def test_profile(self, mock_session):
        # an explicit profile_name is forwarded to boto3.Session
        smart_open.smart_open('s3://mybucket/mykey', profile_name='my_credentials')
        mock_session.assert_called_with(profile_name='my_credentials')
        mock_session.return_value.resource.assert_called_with('s3')

    @mock.patch('boto3.Session')
    def test_host(self, mock_session):
        # a custom host becomes an endpoint_url with 'http://' prepended
        smart_open.smart_open("s3://access_id:access_secret@mybucket/mykey", host='aa.domain.com')
        mock_session.return_value.resource.assert_called_with(
            's3', aws_access_key_id='access_id', aws_secret_access_key='access_secret',
            endpoint_url='http://aa.domain.com'
        )

    @mock.patch('boto3.Session')
    def test_s3_upload(self, mock_session):
        # extra upload parameters must reach initiate_multipart_upload
        smart_open.smart_open("s3://bucket/key", 'wb', s3_upload={
            'ServerSideEncryption': 'AES256',
            'ContentType': 'application/json'
        })

        # Locate the s3.Object instance (mock)
        s3_resource = mock_session.return_value.resource.return_value
        s3_object = s3_resource.Object.return_value

        # Check that `initiate_multipart_upload` was called
        # with the desired args
        s3_object.initiate_multipart_upload.assert_called_with(
            ServerSideEncryption='AES256',
            ContentType='application/json'
        )

    def test_session_read_mode(self):
        """
        Read stream should use a custom boto3.Session

        """
        # a real Session with a mocked-out resource() so no AWS call happens
        session = boto3.Session()
        session.resource = mock.MagicMock()

        smart_open.smart_open('s3://bucket/key', s3_session=session)
        session.resource.assert_called_with('s3')

    def test_session_write_mode(self):
        """
        Write stream should use a custom boto3.Session

        """
        session = boto3.Session()
        session.resource = mock.MagicMock()

        smart_open.smart_open('s3://bucket/key', 'wb', s3_session=session)
        session.resource.assert_called_with('s3')
|
|||
|
|
|||
|
class SmartOpenTest(unittest.TestCase):
    """
    Test reading and writing from/into files.

    """
    @mock.patch('smart_open.smart_open_lib.boto')
    def test_file_mode_mock(self, mock_boto):
        """Are file:// open modes passed correctly?"""
        as_text = u'куда идём мы с пятачком - большой большой секрет'
        as_bytes = as_text.encode('utf-8')

        # incorrect file mode
        self.assertRaises(
            NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x"
        )

        # correct read modes
        #
        # We always open files in binary mode first, but engage
        # encoders/decoders as necessary. Instead of checking how the file
        # _initially_ got opened, we now also check the end result: if the
        # contents got decoded correctly.
        #
        with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
            with smart_open.smart_open("blah", "r", encoding='utf-8') as fin:
                self.assertEqual(fin.read(), as_text)
            mock_open.assert_called_with("blah", "r", encoding='utf-8')

        with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
            with smart_open.smart_open("blah", "rb") as fin:
                self.assertEqual(fin.read(), as_bytes)
            mock_open.assert_called_with("blah", "rb")

        # '~' must be expanded before the underlying io.open call
        short_path = "~/blah"
        full_path = os.path.expanduser(short_path)
        with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
            with smart_open.smart_open(short_path, "rb") as fin:
                mock_open.assert_called_with(full_path, "rb")

        # correct write modes, incorrect scheme
        self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+")
        self.assertRaises(NotImplementedError, smart_open.smart_open, "http:///blah.txt", "w")
        self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+")

        # correct write mode, correct file:// URI
        with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
            with smart_open.smart_open("blah", "w", encoding='utf-8') as fout:
                mock_open.assert_called_with("blah", "w", encoding='utf-8')
                fout.write(as_text)

        with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
            with smart_open.smart_open("/some/file.txt", "w", encoding='utf-8') as fout:
                mock_open.assert_called_with("/some/file.txt", "w", encoding='utf-8')
                fout.write(as_text)

        with mock.patch('io.open', mock.Mock(return_value=io.StringIO(as_text))) as mock_open:
            with smart_open.smart_open("/some/file.txt", "w+", encoding='utf-8') as fout:
                mock_open.assert_called_with("/some/file.txt", "w+", encoding='utf-8')
                fout.write(as_text)

        with mock.patch('io.open', mock.Mock(return_value=io.BytesIO(as_bytes))) as mock_open:
            with smart_open.smart_open("/some/file.txt", "wb+") as fout:
                mock_open.assert_called_with("/some/file.txt", "wb+")
                fout.write(as_bytes)

    @mock.patch('boto3.Session')
    def test_s3_mode_mock(self, mock_session):
        """Are s3:// open modes passed correctly?"""

        # correct write mode, correct s3 URI
        smart_open.smart_open("s3://mybucket/mykey", "w", host='s3.amazonaws.com')
        mock_session.return_value.resource.assert_called_with(
            's3', endpoint_url='http://s3.amazonaws.com'
        )

    @mock.patch('smart_open.hdfs.subprocess')
    def test_hdfs(self, mock_subprocess):
        """Is HDFS write called correctly"""
        smart_open_object = smart_open.smart_open("hdfs:///tmp/test.txt", 'wb')
        smart_open_object.write("test")
        # called with the correct params?
        mock_subprocess.Popen.assert_called_with(
            ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE
        )

        # second possibility of schema
        smart_open_object = smart_open.smart_open("hdfs://tmp/test.txt", 'wb')
        smart_open_object.write("test")
        mock_subprocess.Popen.assert_called_with(
            ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"], stdin=mock_subprocess.PIPE
        )

    @mock_s3
    def test_s3_modes_moto(self):
        """Do s3:// open modes work correctly?"""
        # fake bucket and fake key
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        test_string = b"second test"

        # correct write mode, correct s3 URI
        with smart_open.smart_open("s3://mybucket/newkey", "wb") as fout:
            logger.debug('fout: %r', fout)
            fout.write(test_string)

        logger.debug("write successfully completed")

        output = list(smart_open.smart_open("s3://mybucket/newkey", "rb"))

        self.assertEqual(output, [test_string])

    @mock_s3
    def test_s3_metadata_write(self):
        # Read local file fixture
        path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.gz')
        # NOTE(review): initialised as a text string but immediately
        # overwritten with bytes below; the initialiser is dead code.
        data = ""
        with smart_open.smart_open(path, 'rb') as fd:
            data = fd.read()

        # Create a test bucket
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        # Write data, with multipart_upload options
        write_stream = smart_open.smart_open(
            's3://mybucket/crime-and-punishment.txt.gz', 'wb',
            s3_upload={
                'ContentType': 'text/plain',
                'ContentEncoding': 'gzip'
            }
        )
        with write_stream as fout:
            fout.write(data)

        # the metadata passed via s3_upload must be visible on the object
        key = s3.Object('mybucket', 'crime-and-punishment.txt.gz')
        self.assertIn('text/plain', key.content_type)
        self.assertEqual(key.content_encoding, 'gzip')

    @mock_s3
    def test_write_bad_encoding_strict(self):
        """Should abort on encoding error."""
        # Japanese text is not representable in koi8-r
        text = u'欲しい気持ちが成長しすぎて'

        with self.assertRaises(UnicodeEncodeError):
            with tempfile.NamedTemporaryFile('wb', delete=True) as infile:
                with smart_open.smart_open(infile.name, 'w', encoding='koi8-r',
                                           errors='strict') as fout:
                    fout.write(text)

    @mock_s3
    def test_write_bad_encoding_replace(self):
        """Should replace characters that failed to encode."""
        text = u'欲しい気持ちが成長しすぎて'
        # every character is unencodable, so each becomes '?'
        expected = u'?' * len(text)

        with tempfile.NamedTemporaryFile('wb', delete=True) as infile:
            with smart_open.smart_open(infile.name, 'w', encoding='koi8-r',
                                       errors='replace') as fout:
                fout.write(text)
            with smart_open.smart_open(infile.name, 'r', encoding='koi8-r') as fin:
                actual = fin.read()

        self.assertEqual(expected, actual)
|
|||
|
|
|||
|
class WebHdfsWriteTest(unittest.TestCase):
    """
    Test writing into webhdfs files.

    WebHDFS writes are a two-step handshake: the namenode answers with a
    307 redirect carrying the datanode location, and the payload request
    goes to that location.
    """

    @responses.activate
    def test_initialize_write(self):
        def request_callback(_):
            # emulate the namenode's 307 redirect to the datanode
            resp_body = ""
            headers = {'location': 'http://127.0.0.1:8440/file'}
            return 307, headers, resp_body

        responses.add_callback(responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback)
        responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201)
        smart_open.smart_open("webhdfs://127.0.0.1:8440/path/file", 'wb')

        # exactly two requests: the CREATE redirect and the datanode PUT
        assert len(responses.calls) == 2
        path, params = responses.calls[0].request.url.split("?")
        assert path == "http://127.0.0.1:8440/webhdfs/v1/path/file"
        # query parameter order is not guaranteed, accept either
        assert params == "overwrite=True&op=CREATE" or params == "op=CREATE&overwrite=True"
        assert responses.calls[1].request.url == "http://127.0.0.1:8440/file"

    @responses.activate
    def test_write(self):
        def request_callback(_):
            # emulate the namenode's 307 redirect to the datanode
            resp_body = ""
            headers = {'location': 'http://127.0.0.1:8440/file'}
            return 307, headers, resp_body

        responses.add_callback(responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback)
        responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201)
        smart_open_object = smart_open.smart_open("webhdfs://127.0.0.1:8440/path/file", 'wb')

        def write_callback(request):
            # the datanode must receive the exact UTF-8 payload
            assert request.body == u"žluťoučký koníček".encode('utf8')
            headers = {}
            return 200, headers, ""

        test_string = u"žluťoučký koníček".encode('utf8')
        responses.add_callback(responses.POST, "http://127.0.0.1:8440/webhdfs/v1/path/file", callback=request_callback)
        responses.add_callback(responses.POST, "http://127.0.0.1:8440/file", callback=write_callback)
        smart_open_object.write(test_string)
        smart_open_object.close()

        # calls 0-1 are the CREATE handshake; calls 2-3 are the APPEND handshake
        assert len(responses.calls) == 4
        assert responses.calls[2].request.url == "http://127.0.0.1:8440/webhdfs/v1/path/file?op=APPEND"
        assert responses.calls[3].request.url == "http://127.0.0.1:8440/file"
|
|||
|
|
|||
|
class CompressionFormatTest(unittest.TestCase):
    """
    Test that compression is applied transparently based on the file extension.
    """

    TEXT = 'Hello'

    def write_read_assertion(self, test_file):
        """Round-trip TEXT through *test_file* and delete the file afterwards."""
        with smart_open.smart_open(test_file, 'wb') as fout:  # 'b' for binary, needed on Windows
            fout.write(self.TEXT.encode('utf8'))

        with smart_open.smart_open(test_file, 'rb') as fin:
            self.assertEqual(fin.read().decode('utf8'), self.TEXT)

        if os.path.isfile(test_file):
            os.unlink(test_file)

    def test_open_gz(self):
        """Can open gzip?"""
        fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
        with smart_open.smart_open(fpath) as infile:
            data = infile.read()
        # Checksum of the expected decompressed content.
        m = hashlib.md5(data)
        assert m.hexdigest() == '18473e60f8c7c98d29d65bf805736a0d', \
            'Failed to read gzip'

    def test_write_read_gz(self):
        """Can write and read gzip?"""
        # Only the file *name* is needed; the empty temp file gets overwritten.
        with tempfile.NamedTemporaryFile('wb', suffix='.gz', delete=False) as infile:
            test_file_name = infile.name
        self.write_read_assertion(test_file_name)

    def test_write_read_bz2(self):
        """Can write and read bz2?"""
        with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile:
            test_file_name = infile.name
        self.write_read_assertion(test_file_name)
|
|||
|
class MultistreamsBZ2Test(unittest.TestCase):
|
|||
|
"""
|
|||
|
Test that multistream bzip2 compressed files can be read.
|
|||
|
|
|||
|
"""
|
|||
|
|
|||
|
# note: these tests are derived from the Python 3.x tip bz2 tests.
|
|||
|
|
|||
|
TEXT_LINES = [
|
|||
|
b'root:x:0:0:root:/root:/bin/bash\n',
|
|||
|
b'bin:x:1:1:bin:/bin:\n',
|
|||
|
b'daemon:x:2:2:daemon:/sbin:\n',
|
|||
|
b'adm:x:3:4:adm:/var/adm:\n',
|
|||
|
b'lp:x:4:7:lp:/var/spool/lpd:\n',
|
|||
|
b'sync:x:5:0:sync:/sbin:/bin/sync\n',
|
|||
|
b'shutdown:x:6:0:shutdown:/sbin:/sbin/shutdown\n',
|
|||
|
b'halt:x:7:0:halt:/sbin:/sbin/halt\n',
|
|||
|
b'mail:x:8:12:mail:/var/spool/mail:\n',
|
|||
|
b'news:x:9:13:news:/var/spool/news:\n',
|
|||
|
b'uucp:x:10:14:uucp:/var/spool/uucp:\n',
|
|||
|
b'operator:x:11:0:operator:/root:\n',
|
|||
|
b'games:x:12:100:games:/usr/games:\n',
|
|||
|
b'gopher:x:13:30:gopher:/usr/lib/gopher-data:\n',
|
|||
|
b'ftp:x:14:50:FTP User:/var/ftp:/bin/bash\n',
|
|||
|
b'nobody:x:65534:65534:Nobody:/home:\n',
|
|||
|
b'postfix:x:100:101:postfix:/var/spool/postfix:\n',
|
|||
|
b'niemeyer:x:500:500::/home/niemeyer:/bin/bash\n',
|
|||
|
b'postgres:x:101:102:PostgreSQL Server:/var/lib/pgsql:/bin/bash\n',
|
|||
|
b'mysql:x:102:103:MySQL server:/var/lib/mysql:/bin/bash\n',
|
|||
|
b'www:x:103:104::/var/www:/bin/false\n',
|
|||
|
]
|
|||
|
|
|||
|
TEXT = b''.join(TEXT_LINES)
|
|||
|
|
|||
|
DATA = \
|
|||
|
b'BZh91AY&SY.\xc8N\x18\x00\x01>_\x80\x00\x10@\x02\xff\xf0\x01\x07n\x00?\xe7\xff\xe00\x01\x99\xaa\x00' \
|
|||
|
b'\xc0\x03F\x86\x8c#&\x83F\x9a\x03\x06\xa6\xd0\xa6\x93M\x0fQ\xa7\xa8\x06\x804hh\x12$\x11\xa4i4\xf14S' \
|
|||
|
b'\xd2<Q\xb5\x0fH\xd3\xd4\xdd\xd5\x87\xbb\xf8\x94\r\x8f\xafI\x12\xe1\xc9\xf8/E\x00pu\x89\x12]\xc9' \
|
|||
|
b'\xbbDL\nQ\x0e\t1\x12\xdf\xa0\xc0\x97\xac2O9\x89\x13\x94\x0e\x1c7\x0ed\x95I\x0c\xaaJ\xa4\x18L\x10' \
|
|||
|
b'\x05#\x9c\xaf\xba\xbc/\x97\x8a#C\xc8\xe1\x8cW\xf9\xe2\xd0\xd6M\xa7\x8bXa<e\x84t\xcbL\xb3\xa7\xd9' \
|
|||
|
b'\xcd\xd1\xcb\x84.\xaf\xb3\xab\xab\xad`n}\xa0lh\tE,\x8eZ\x15\x17VH>\x88\xe5\xcd9gd6\x0b\n\xe9\x9b' \
|
|||
|
b'\xd5\x8a\x99\xf7\x08.K\x8ev\xfb\xf7xw\xbb\xdf\xa1\x92\xf1\xdd|/";\xa2\xba\x9f\xd5\xb1#A\xb6\xf6' \
|
|||
|
b'\xb3o\xc9\xc5y\\\xebO\xe7\x85\x9a\xbc\xb6f8\x952\xd5\xd7"%\x89>V,\xf7\xa6z\xe2\x9f\xa3\xdf\x11' \
|
|||
|
b'\x11"\xd6E)I\xa9\x13^\xca\xf3r\xd0\x03U\x922\xf26\xec\xb6\xed\x8b\xc3U\x13\x9d\xc5\x170\xa4\xfa^' \
|
|||
|
b'\x92\xacDF\x8a\x97\xd6\x19\xfe\xdd\xb8\xbd\x1a\x9a\x19\xa3\x80ankR\x8b\xe5\xd83]\xa9\xc6\x08' \
|
|||
|
b'\x82f\xf6\xb9"6l$\xb8j@\xc0\x8a\xb0l1..\xbak\x83ls\x15\xbc\xf4\xc1\x13\xbe\xf8E\xb8\x9d\r\xa8\x9dk' \
|
|||
|
b'\x84\xd3n\xfa\xacQ\x07\xb1%y\xaav\xb4\x08\xe0z\x1b\x16\xf5\x04\xe9\xcc\xb9\x08z\x1en7.G\xfc]\xc9\x14' \
|
|||
|
b'\xe1B@\xbb!8`'
|
|||
|
|
|||
|
def create_temp_bz2(self, streams=1):
|
|||
|
f = tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False)
|
|||
|
f.write(self.DATA * streams)
|
|||
|
f.close()
|
|||
|
return f.name
|
|||
|
|
|||
|
def cleanup_temp_bz2(self, test_file):
|
|||
|
if os.path.isfile(test_file):
|
|||
|
os.unlink(test_file)
|
|||
|
|
|||
|
def test_can_read_multistream_bz2(self):
|
|||
|
if PY2:
|
|||
|
# this is a backport from Python 3
|
|||
|
from bz2file import BZ2File
|
|||
|
else:
|
|||
|
from bz2 import BZ2File
|
|||
|
|
|||
|
test_file = self.create_temp_bz2(streams=5)
|
|||
|
with BZ2File(test_file) as bz2f:
|
|||
|
self.assertEqual(bz2f.read(), self.TEXT * 5)
|
|||
|
self.cleanup_temp_bz2(test_file)
|
|||
|
|
|||
|
def test_python2_stdlib_bz2_cannot_read_multistream(self):
|
|||
|
# Multistream bzip is included in Python 3
|
|||
|
if not PY2:
|
|||
|
return
|
|||
|
import bz2
|
|||
|
|
|||
|
test_file = self.create_temp_bz2(streams=5)
|
|||
|
bz2f = bz2.BZ2File(test_file)
|
|||
|
self.assertNotEqual(bz2f.read(), self.TEXT * 5)
|
|||
|
bz2f.close()
|
|||
|
self.cleanup_temp_bz2(test_file)
|
|||
|
|
|||
|
def test_file_smart_open_can_read_multistream_bz2(self):
|
|||
|
test_file = self.create_temp_bz2(streams=5)
|
|||
|
with smart_open_lib.smart_open(test_file) as bz2f:
|
|||
|
self.assertEqual(bz2f.read(), self.TEXT * 5)
|
|||
|
self.cleanup_temp_bz2(test_file)
|
|||
|
|
|||
|
|
|||
|
class S3OpenTest(unittest.TestCase):
    """Tests for opening S3 objects, with mocked S3 via moto."""

    @mock_s3
    def test_r(self):
        """Reading a UTF string should work."""
        text = u"физкульт-привет!"

        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = s3.Object('bucket', 'key')
        key.put(Body=text.encode('utf-8'))

        with smart_open.smart_open('s3://bucket/key', "rb") as fin:
            self.assertEqual(fin.read(), text.encode('utf-8'))

        with smart_open.smart_open('s3://bucket/key', "r", encoding='utf-8') as fin:
            self.assertEqual(fin.read(), text)

    def test_bad_mode(self):
        """Bad mode should raise an exception."""
        uri = smart_open_lib._parse_uri("s3://bucket/key")
        self.assertRaises(NotImplementedError, smart_open.smart_open, uri, "x")

    @mock_s3
    def test_rw_encoding(self):
        """Should read and write text, respecting encodings, etc."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')

        key = "s3://bucket/key"
        text = u"расцветали яблони и груши"

        with smart_open.smart_open(key, "w", encoding="koi8-r") as fout:
            fout.write(text)

        with smart_open.smart_open(key, "r", encoding="koi8-r") as fin:
            self.assertEqual(text, fin.read())

        with smart_open.smart_open(key, "rb") as fin:
            self.assertEqual(text.encode("koi8-r"), fin.read())

        # Reading back with the wrong codec must fail with strict errors...
        with smart_open.smart_open(key, "r", encoding="euc-jp") as fin:
            self.assertRaises(UnicodeDecodeError, fin.read)

        # ...but succeed (with substitutions) when errors="replace".
        with smart_open.smart_open(key, "r", encoding="euc-jp", errors="replace") as fin:
            fin.read()

    @mock_s3
    def test_rw_gzip(self):
        """Should read/write gzip files, implicitly and explicitly."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.gz"

        text = u"не слышны в саду даже шорохи"
        with smart_open.smart_open(key, "wb") as fout:
            fout.write(text.encode("utf-8"))

        #
        # Check that what we've created is a gzip.
        #
        with smart_open.smart_open(key, "rb", ignore_extension=True) as fin:
            gz = gzip.GzipFile(fileobj=fin)
            self.assertEqual(gz.read().decode("utf-8"), text)

        #
        # We should be able to read it back as well.
        #
        with smart_open.smart_open(key, "rb") as fin:
            self.assertEqual(fin.read().decode("utf-8"), text)

    @mock_s3
    def test_gzip_write_mode(self):
        """Should always open in binary mode when writing through a codec."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')

        with mock.patch('smart_open.smart_open_s3.open') as mock_open:
            smart_open.smart_open("s3://bucket/key.gz", "wb")
            mock_open.assert_called_with('bucket', 'key.gz', 'wb')

    @mock_s3
    def test_gzip_read_mode(self):
        """Should always open in binary mode when reading through a codec."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.gz"

        text = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён"
        with smart_open.smart_open(key, "wb") as fout:
            fout.write(text.encode("utf-8"))

        with mock.patch('smart_open.smart_open_s3.open') as mock_open:
            smart_open.smart_open(key, "r")
            mock_open.assert_called_with('bucket', 'key.gz', 'rb')

    @mock_s3
    def test_read_encoding(self):
        """Should open the file with the correct encoding, explicit text read."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'это знала ева, это знал адам, колеса любви едут прямо по нам'
        with smart_open.smart_open(key, 'wb') as fout:
            fout.write(text.encode('koi8-r'))
        with smart_open.smart_open(key, 'r', encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(text, actual)

    @mock_s3
    def test_read_encoding_implicit_text(self):
        """Should open the file with the correct encoding, implicit text read."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'это знала ева, это знал адам, колеса любви едут прямо по нам'
        with smart_open.smart_open(key, 'wb') as fout:
            fout.write(text.encode('koi8-r'))
        with smart_open.smart_open(key, encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(text, actual)

    @mock_s3
    def test_write_encoding(self):
        """Should open the file for writing with the correct encoding."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'какая боль, какая боль, аргентина - ямайка, 5-0'

        with smart_open.smart_open(key, 'w', encoding='koi8-r') as fout:
            fout.write(text)
        with smart_open.smart_open(key, encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(text, actual)

    @mock_s3
    def test_write_bad_encoding_strict(self):
        """Writing unencodable text with errors='strict' should raise."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'欲しい気持ちが成長しすぎて'

        with self.assertRaises(UnicodeEncodeError):
            with smart_open.smart_open(key, 'w', encoding='koi8-r', errors='strict') as fout:
                fout.write(text)

    @mock_s3
    def test_write_bad_encoding_replace(self):
        """Writing unencodable text with errors='replace' should substitute '?'."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'欲しい気持ちが成長しすぎて'
        expected = u'?' * len(text)

        with smart_open.smart_open(key, 'w', encoding='koi8-r', errors='replace') as fout:
            fout.write(text)
        with smart_open.smart_open(key, encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(expected, actual)

    @mock_s3
    def test_write_text_gzip(self):
        """Should write and read text through gzip with the given encoding."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt.gz"
        text = u'какая боль, какая боль, аргентина - ямайка, 5-0'

        with smart_open.smart_open(key, 'w', encoding='utf-8') as fout:
            fout.write(text)
        with smart_open.smart_open(key, 'r', encoding='utf-8') as fin:
            actual = fin.read()
        self.assertEqual(text, actual)
|
|
|||
|
if __name__ == '__main__':
    # Verbose logging helps when debugging individual test failures.
    logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.DEBUG,
    )
    unittest.main()
|