""" Tests for the pandas.io.common functionalities """ import mmap import pytest import os from os.path import isabs import pandas as pd import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.io import common from pandas.compat import is_platform_windows, StringIO, FileNotFoundError from pandas import read_csv, concat class CustomFSPath(object): """For testing fspath on unknown objects""" def __init__(self, path): self.path = path def __fspath__(self): return self.path # Functions that consume a string path and return a string or path-like object path_types = [str, CustomFSPath] try: from pathlib import Path path_types.append(Path) except ImportError: pass try: from py.path import local as LocalPath path_types.append(LocalPath) except ImportError: pass HERE = os.path.abspath(os.path.dirname(__file__)) class TestCommonIOCapabilities(object): data1 = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ def test_expand_user(self): filename = '~/sometest' expanded_name = common._expand_user(filename) assert expanded_name != filename assert isabs(expanded_name) assert os.path.expanduser(filename) == expanded_name def test_expand_user_normal_path(self): filename = '/somefolder/sometest' expanded_name = common._expand_user(filename) assert expanded_name == filename assert os.path.expanduser(filename) == expanded_name @td.skip_if_no('pathlib') def test_stringify_path_pathlib(self): rel_path = common._stringify_path(Path('.')) assert rel_path == '.' redundant_path = common._stringify_path(Path('foo//bar')) assert redundant_path == os.path.join('foo', 'bar') @td.skip_if_no('py.path') def test_stringify_path_localpath(self): path = os.path.join('foo', 'bar') abs_path = os.path.abspath(path) lpath = LocalPath(path) assert common._stringify_path(lpath) == abs_path def test_stringify_path_fspath(self): p = CustomFSPath('foo/bar.csv') result = common._stringify_path(p) assert result == 'foo/bar.csv' @pytest.mark.parametrize('extension,expected', [ ('', None), ('.gz', 'gzip'), ('.bz2', 'bz2'), ('.zip', 'zip'), ('.xz', 'xz'), ]) @pytest.mark.parametrize('path_type', path_types) def test_infer_compression_from_path(self, extension, expected, path_type): path = path_type('foo/bar.csv' + extension) compression = common._infer_compression(path, compression='infer') assert compression == expected def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( filename) assert filepath_or_buffer != filename assert isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer assert not should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( input_buffer) assert filepath_or_buffer == input_buffer assert not should_close def test_iterator(self): reader = read_csv(StringIO(self.data1), chunksize=1) result = concat(reader, ignore_index=True) expected = read_csv(StringIO(self.data1)) tm.assert_frame_equal(result, expected) # GH12153 it = read_csv(StringIO(self.data1), chunksize=1) first = next(it) tm.assert_frame_equal(first, expected.iloc[[0]]) tm.assert_frame_equal(concat(it), expected.iloc[1:]) @pytest.mark.parametrize('reader, module, error_class, fn_ext', [ (pd.read_csv, 'os', FileNotFoundError, 'csv'), (pd.read_table, 'os', FileNotFoundError, 'csv'), (pd.read_fwf, 'os', FileNotFoundError, 'txt'), (pd.read_excel, 'xlrd', FileNotFoundError, 'xlsx'), (pd.read_feather, 'feather', Exception, 'feather'), (pd.read_hdf, 'tables', FileNotFoundError, 'h5'), (pd.read_stata, 'os', FileNotFoundError, 'dta'), (pd.read_sas, 'os', FileNotFoundError, 'sas7bdat'), (pd.read_json, 'os', ValueError, 'json'), (pd.read_msgpack, 'os', ValueError, 'mp'), (pd.read_pickle, 'os', FileNotFoundError, 'pickle'), ]) def test_read_non_existant(self, reader, module, error_class, fn_ext): pytest.importorskip(module) path = os.path.join(HERE, 'data', 'does_not_exist.' + fn_ext) with pytest.raises(error_class): reader(path) @pytest.mark.parametrize('reader, module, path', [ (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), (pd.read_table, 'os', ('io', 'data', 'iris.csv')), (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', 'datetimetz_object.h5')), (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) expected = reader(path) if path.endswith('.pickle'): # categorical tm.assert_categorical_equal(result, expected) else: tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('writer_name, writer_kwargs, module', [ ('to_csv', {}, 'os'), ('to_excel', {'engine': 'xlwt'}, 'xlwt'), ('to_feather', {}, 'feather'), ('to_html', {}, 'os'), ('to_json', {}, 'os'), ('to_latex', {}, 'os'), ('to_msgpack', {}, 'os'), ('to_pickle', {}, 'os'), ('to_stata', {}, 'os'), ]) def test_write_fspath_all(self, writer_name, writer_kwargs, module): p1 = tm.ensure_clean('string') p2 = tm.ensure_clean('fspath') df = pd.DataFrame({"A": [1, 2]}) with p1 as string, p2 as fspath: pytest.importorskip(module) mypath = CustomFSPath(fspath) writer = getattr(df, writer_name) writer(string, **writer_kwargs) with open(string, 'rb') as f: expected = f.read() writer(mypath, **writer_kwargs) with open(fspath, 'rb') as f: result = f.read() assert result == expected def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll # have to read and compare equality pytest.importorskip('tables') df = pd.DataFrame({"A": [1, 2]}) p1 = tm.ensure_clean('string') p2 = tm.ensure_clean('fspath') with p1 as string, p2 as fspath: mypath = CustomFSPath(fspath) df.to_hdf(mypath, key='bar') df.to_hdf(string, key='bar') result = pd.read_hdf(fspath, key='bar') expected = pd.read_hdf(string, key='bar') tm.assert_frame_equal(result, expected) @pytest.fixture def mmap_file(datapath): return datapath('io', 'data', 'test_mmap.csv') class TestMMapWrapper(object): def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 # the error raised is different on Windows if is_platform_windows(): msg = "The parameter is incorrect" err = OSError else: msg = "[Errno 22]" err = mmap.error tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) def test_get_attr(self, mmap_file): with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) attrs = [attr for attr in attrs if not attr.startswith('__')] attrs.append('__next__') for attr in attrs: assert hasattr(wrapper, attr) assert not hasattr(wrapper, 'foo') def test_next(self, mmap_file): with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() for line in lines: next_line = next(wrapper) assert next_line.strip() == line.strip() pytest.raises(StopIteration, next, wrapper) def test_unknown_engine(self): with tm.ensure_clean() as path: df = tm.makeDataFrame() df.to_csv(path) with tm.assert_raises_regex(ValueError, 'Unknown engine'): read_csv(path, engine='pyt')