139 lines
4.7 KiB
Python
139 lines
4.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
"""
Tests compressed data parsing functionality for all
of the parsers defined in parsers.py
"""
|
|
|
|
import pytest
|
|
|
|
import pandas as pd
|
|
import pandas.compat as compat
|
|
import pandas.util.testing as tm
|
|
import pandas.util._test_decorators as td
|
|
|
|
import gzip
|
|
import bz2
|
|
# ``lzma`` is treated as optional: when ``compat.import_lzma()`` raises
# ImportError the name stays ``None``.  The xz test case in this module
# looks the compressor up with ``getattr(lzma, 'LZMAFile', None)``, so a
# ``None`` placeholder is enough to keep the module importable.
lzma = None
try:
    lzma = compat.import_lzma()
except ImportError:
    # Deliberate: keep the ``None`` default assigned above.
    pass
|
|
|
|
|
|
class CompressionTests(object):
    """
    Compression-related parser tests, written as a mixin.

    The tests only rely on three attributes of the host class:
    ``self.read_csv`` (the parser entry point under test), ``self.csv1``
    (path of an uncompressed CSV file used as reference data) and
    ``self.engine`` (name of the parser engine).
    """

    def test_zip(self):
        """Check zip input: one member, several members, none, invalid."""
        import zipfile

        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
        expected = self.read_csv(self.csv1)

        # Archive with exactly one member: readable both with an explicit
        # compression argument and with inference from the file name.
        with tm.ensure_clean('test_file.zip') as path:
            # ``with`` guarantees the archive is finalised and closed even
            # if ``writestr`` raises.
            with zipfile.ZipFile(path, mode='w') as tmp:
                tmp.writestr('test_file', data)

            result = self.read_csv(path, compression='zip')
            tm.assert_frame_equal(result, expected)

            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

            # Compare by value.  ``is not`` against a str literal tests
            # object identity, which only works by accident of string
            # interning and triggers a SyntaxWarning on Python 3.8+.
            if self.engine != 'python':
                with open(path, 'rb') as f:
                    result = self.read_csv(f, compression='zip')
                    tm.assert_frame_equal(result, expected)

        # Archive with two members: the parser must refuse to guess.
        with tm.ensure_clean('combined_zip.zip') as path:
            inner_file_names = ['test_file', 'second_file']
            with zipfile.ZipFile(path, mode='w') as tmp:
                for file_name in inner_file_names:
                    tmp.writestr(file_name, data)

            with tm.assert_raises_regex(ValueError, 'Multiple files'):
                self.read_csv(path, compression='zip')

            with tm.assert_raises_regex(ValueError, 'Multiple files'):
                self.read_csv(path, compression='infer')

        # Archive with no members at all.
        with tm.ensure_clean() as path:
            with zipfile.ZipFile(path, mode='w'):
                pass

            with tm.assert_raises_regex(ValueError, 'Zero files'):
                self.read_csv(path, compression='zip')

        # A freshly created, empty file handle is not a zip archive.
        with tm.ensure_clean() as path:
            with open(path, 'wb') as f:
                # ``BadZipfile`` (lower-case "f") is the spelling available
                # on both Python 2 and Python 3.
                with pytest.raises(zipfile.BadZipfile):
                    self.read_csv(f, compression='zip')

    @pytest.mark.parametrize('compress_type, compress_method, ext', [
        ('gzip', gzip.GzipFile, 'gz'),
        ('bz2', bz2.BZ2File, 'bz2'),
        pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz',
                     marks=td.skip_if_no_lzma)
    ])
    def test_other_compression(self, compress_type, compress_method, ext):
        """
        Round-trip the reference CSV through gzip, bz2 and xz.

        Parameters
        ----------
        compress_type : str
            Value passed as ``compression=`` to ``read_csv``.
        compress_method : callable
            File-like class used to write the compressed file; called as
            ``compress_method(path, mode='wb')``.
        ext : str
            File extension used when testing ``compression='infer'``.
        """
        with open(self.csv1, 'rb') as data_file:
            data = data_file.read()
        expected = self.read_csv(self.csv1)

        # Explicit compression argument, file given by path and by handle.
        with tm.ensure_clean() as path:
            # Context manager closes (and flushes) the compressed stream
            # even if ``write`` raises.
            with compress_method(path, mode='wb') as tmp:
                tmp.write(data)

            result = self.read_csv(path, compression=compress_type)
            tm.assert_frame_equal(result, expected)

            if compress_type == 'bz2':
                # An unknown compression name must be rejected.
                with pytest.raises(ValueError):
                    self.read_csv(path, compression='bz3')

            with open(path, 'rb') as fin:
                result = self.read_csv(fin, compression=compress_type)
                tm.assert_frame_equal(result, expected)

        # Compression inferred from the file extension.
        with tm.ensure_clean('test.{}'.format(ext)) as path:
            with compress_method(path, mode='wb') as tmp:
                tmp.write(data)

            result = self.read_csv(path, compression='infer')
            tm.assert_frame_equal(result, expected)

    def test_read_csv_infer_compression(self):
        """Check ``compression='infer'`` for paths and an open handle."""
        # see gh-9770
        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)

        with open(self.csv1) as f:
            # The open handle ``f`` is one of the inputs, so the loop has
            # to run while the file is still open.
            inputs = [self.csv1, self.csv1 + '.gz',
                      self.csv1 + '.bz2', f]

            for inp in inputs:
                df = self.read_csv(inp, index_col=0, parse_dates=True,
                                   compression='infer')

                tm.assert_frame_equal(expected, df)

    def test_read_csv_compressed_utf16_example(self, datapath):
        """Read a zipped, tab-separated file with ``encoding='utf-16'``."""
        # GH18071
        path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')

        result = self.read_csv(path, encoding='utf-16',
                               compression='zip', sep='\t')
        expected = pd.DataFrame({
            u'Country': [u'Venezuela', u'Venezuela'],
            u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.']
        })

        tm.assert_frame_equal(result, expected)

    def test_invalid_compression(self):
        """An unknown ``compression`` value raises a ValueError."""
        msg = 'Unrecognized compression type: sfark'
        with tm.assert_raises_regex(ValueError, msg):
            self.read_csv('test_file.zip', compression='sfark')