import pandas as pd from pandas.compat import PY2 import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.errors import EmptyDataError import os import io import numpy as np import pytest class TestSAS7BDAT(object): @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: fname = os.path.join( self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) df = pd.read_csv(fname) epoch = pd.datetime(1960, 1, 1) t1 = pd.to_timedelta(df["Column4"], unit='d') df["Column4"] = epoch + t1 t2 = pd.to_timedelta(df["Column12"], unit='d') df["Column12"] = epoch + t2 for k in range(df.shape[1]): col = df.iloc[:, k] if col.dtype == np.int64: df.iloc[:, k] = df.iloc[:, k].astype(np.float64) elif col.dtype == np.dtype('O'): if PY2: f = lambda x: (x.decode('utf-8') if isinstance(x, str) else x) df.iloc[:, k] = df.iloc[:, k].apply(f) self.data.append(df) def test_from_file(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k)) df = pd.read_sas(fname, encoding='utf-8') tm.assert_frame_equal(df, df0) def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k)) with open(fname, 'rb') as f: byts = f.read() buf = io.BytesIO(byts) rdr = pd.read_sas(buf, format="sas7bdat", iterator=True, encoding='utf-8') df = rdr.read() tm.assert_frame_equal(df, df0, check_exact=False) rdr.close() def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k)) rdr = pd.read_sas(fname, iterator=True, encoding='utf-8') df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) df = rdr.read(3) tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() @td.skip_if_no('pathlib') def test_path_pathlib(self): from pathlib import Path for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = Path(os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k))) df = pd.read_sas(fname, encoding='utf-8') tm.assert_frame_equal(df, df0) @td.skip_if_no('py.path') def test_path_localpath(self): from py.path import local as LocalPath for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: fname = LocalPath(os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k))) df = pd.read_sas(fname, encoding='utf-8') tm.assert_frame_equal(df, df0) def test_iterator_loop(self): # github #13654 for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: fname = os.path.join( self.dirpath, "test{k}.sas7bdat".format(k=k)) rdr = pd.read_sas(fname, chunksize=10, encoding='utf-8') y = 0 for x in rdr: y += x.shape[0] assert y == rdr.row_count rdr.close() def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding='utf-8') d1 = rdr.read(rdr.row_count + 20) rdr.close() rdr = pd.read_sas(fname, iterator=True, encoding="utf-8") d2 = rdr.read(rdr.row_count + 20) tm.assert_frame_equal(d1, d2) rdr.close() def test_encoding_options(datapath): fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: try: df1[col] = df1[col].str.decode('utf-8') except AttributeError: pass tm.assert_frame_equal(df1, df2) from pandas.io.sas.sas7bdat import SAS7BDATReader rdr = SAS7BDATReader(fname, convert_header_text=False) df3 = rdr.read() rdr.close() for x, y in zip(df1.columns, df3.columns): assert(x == y.decode()) def test_productsales(datapath): fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') fname = datapath("io", "sas", "data", "productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) def test_12659(datapath): fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) def test_airline(datapath): fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) # GH 19732: Timestamps imported from sas will incur floating point errors df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') tm.assert_frame_equal(df, df0) def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname)