# -*- coding: utf-8 -*-
# pylint: disable=E1101

import datetime as dt
import io
import gzip
import os
import struct
import warnings
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
import pandas.util.testing as tm
from pandas import compat
from pandas.compat import iterkeys
from pandas.core.dtypes.common import is_categorical_dtype
from pandas.core.frame import DataFrame, Series
from pandas.io.parsers import read_csv
from pandas.io.stata import (InvalidColumnName, PossiblePrecisionLoss,
                             StataMissingValue, StataReader, read_stata)


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "data")


@pytest.fixture
def parsed_114(dirpath):
    dta14_114 = os.path.join(dirpath, 'stata5_114.dta')
    parsed_114 = read_stata(dta14_114, convert_dates=True)
    parsed_114.index.name = 'index'
    return parsed_114


class TestStata(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "data")
        self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
        self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')

        self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
        self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
        self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
        self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')

        self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
        self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
        self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
        self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
        self.csv3 = os.path.join(self.dirpath, 'stata3.csv')

        self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
        self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
        self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
        self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')

        self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')

        self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
        self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
        self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
        self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
        self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')

        self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
        self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
        self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
        self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
        self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')

        self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
        self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')

        self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta')
        self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta')
        self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta')

        self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
        self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')

        self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta')
        self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta')

        self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta')
        self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta')

        self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')

        self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
        self.dta23 = os.path.join(self.dirpath, 'stata15.dta')

        self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')

        self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')

    def read_dta(self, file):
        # Legacy default reader configuration
        return read_stata(file, convert_dates=True)

    def read_csv(self, file):
        return read_csv(file, parse_dates=True)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_empty_dta(self, version):
        empty_ds = DataFrame(columns=['unit'])
        # GH 7369, make sure can read a 0-obs dta file
        with tm.ensure_clean() as path:
            empty_ds.to_stata(path, write_index=False, version=version)
            empty_ds2 = read_stata(path)
            tm.assert_frame_equal(empty_ds, empty_ds2)

    def test_data_method(self):
        # Minimal testing of legacy data method
        with StataReader(self.dta1_114) as rdr:
            with warnings.catch_warnings(record=True) as w:  # noqa
                parsed_114_data = rdr.data()

        with StataReader(self.dta1_114) as rdr:
            parsed_114_read = rdr.read()
        tm.assert_frame_equal(parsed_114_data, parsed_114_read)

    @pytest.mark.parametrize(
        'file', ['dta1_114', 'dta1_117'])
    def test_read_dta1(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        # Pandas uses np.nan as missing value.
        # Thus, all columns will be of type float, regardless of their name.
        expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                             columns=['float_miss', 'double_miss', 'byte_miss',
                                      'int_miss', 'long_miss'])

        # this is an oddity as really the nan should be float64, but
        # the casting doesn't fail so need to match stata here
        expected['float_miss'] = expected['float_miss'].astype(np.float32)

        tm.assert_frame_equal(parsed, expected)

    def test_read_dta2(self):
        expected = DataFrame.from_records(
            [
                (
                    datetime(2006, 11, 19, 23, 13, 20),
                    1479596223000,
                    datetime(2010, 1, 20),
                    datetime(2010, 1, 8),
                    datetime(2010, 1, 1),
                    datetime(1974, 7, 1),
                    datetime(2010, 1, 1),
                    datetime(2010, 1, 1)
                ),
                (
                    datetime(1959, 12, 31, 20, 3, 20),
                    -1479590,
                    datetime(1953, 10, 2),
                    datetime(1948, 6, 10),
                    datetime(1955, 1, 1),
                    datetime(1955, 7, 1),
                    datetime(1955, 1, 1),
                    datetime(2, 1, 1)
                ),
                (
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                )
            ],
            columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date',
                     'monthly_date', 'quarterly_date', 'half_yearly_date',
                     'yearly_date']
        )
        expected['yearly_date'] = expected['yearly_date'].astype('O')

        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            parsed_114 = self.read_dta(self.dta2_114)
            parsed_115 = self.read_dta(self.dta2_115)
            parsed_117 = self.read_dta(self.dta2_117)
            # 113 is buggy due to limits of date format support in Stata
            # parsed_113 = self.read_dta(self.dta2_113)

            # Remove resource warnings
            w = [x for x in w if x.category is UserWarning]

            # should get warning for each call to read_dta
            assert len(w) == 3

        # buggy test because of the NaT comparison on certain platforms
        # Format 113 test fails since it does not support tc and tC formats
        # tm.assert_frame_equal(parsed_113, expected)
        tm.assert_frame_equal(parsed_114, expected,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(parsed_115, expected,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(parsed_117, expected,
                              check_datetimelike_compat=True)

    @pytest.mark.parametrize(
        'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117'])
    def test_read_dta3(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        # match stata here
        expected = self.read_csv(self.csv3)
        expected = expected.astype(np.float32)
        expected['year'] = expected['year'].astype(np.int16)
        expected['quarter'] = expected['quarter'].astype(np.int8)

        tm.assert_frame_equal(parsed, expected)

    @pytest.mark.parametrize(
        'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117'])
    def test_read_dta4(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        expected = DataFrame.from_records(
            [
                ["one", "ten", "one", "one", "one"],
                ["two", "nine", "two", "two", "two"],
                ["three", "eight", "three", "three", "three"],
                ["four", "seven", 4, "four", "four"],
                ["five", "six", 5, np.nan, "five"],
                ["six", "five", 6, np.nan, "six"],
                ["seven", "four", 7, np.nan, "seven"],
                ["eight", "three", 8, np.nan, "eight"],
                ["nine", "two", 9, np.nan, "nine"],
                ["ten", "one", "ten", np.nan, "ten"]
            ],
            columns=['fully_labeled', 'fully_labeled2',
                     'incompletely_labeled', 'labeled_with_missings',
                     'float_labelled'])

        # these are all categoricals
        expected = pd.concat([expected[col].astype('category')
                              for col in expected], axis=1)

        # stata doesn't save .category metadata
        tm.assert_frame_equal(parsed, expected, check_categorical=False)

    # File containing strls
    def test_read_dta12(self):
        parsed_117 = self.read_dta(self.dta21_117)
        expected = DataFrame.from_records(
            [
                [1, "abc", "abcdefghi"],
                [3, "cba", "qwertywertyqwerty"],
                [93, "", "strl"],
            ],
            columns=['x', 'y', 'z'])

        tm.assert_frame_equal(parsed_117, expected, check_dtype=False)

    def test_read_dta18(self):
        parsed_118 = self.read_dta(self.dta22_118)
        parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
        expected = DataFrame.from_records(
            [['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
             ['Dog', 'Boston', u'Uzunköprü', np.nan, np.nan, np.nan, np.nan],
             ['Plane', 'Rome', u'Tromsø', 0, 0.0, 'option a', 0.0],
             ['Potato', 'Tokyo', u'Elâzığ', -4, 4.0, 4, 4],
             ['', '', '', 0, 0.3332999, 'option a', 1 / 3.]
             ],
            columns=['Things', 'Cities', 'Unicode_Cities_Strl',
                     'Ints', 'Floats', 'Bytes', 'Longs'])
        expected["Floats"] = expected["Floats"].astype(np.float32)
        for col in parsed_118.columns:
            tm.assert_almost_equal(parsed_118[col], expected[col])

        with StataReader(self.dta22_118) as rdr:
            vl = rdr.variable_labels()
            vl_expected = {u'Unicode_Cities_Strl':
                           u'Here are some strls with Ünicode chars',
                           u'Longs': u'long data',
                           u'Things': u'Here are some things',
                           u'Bytes': u'byte data',
                           u'Ints': u'int data',
                           u'Cities': u'Here are some cities',
                           u'Floats': u'float data'}
            tm.assert_dict_equal(vl, vl_expected)

            assert rdr.data_label == u'This is a Ünicode data label'

    def test_read_write_dta5(self):
        original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                             columns=['float_miss', 'double_miss', 'byte_miss',
                                      'int_miss', 'long_miss'])
        original.index.name = 'index'

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original)

    def test_write_dta6(self):
        original = self.read_csv(self.csv3)
        original.index.name = 'index'
        original.index = original.index.astype(np.int32)
        original['year'] = original['year'].astype(np.int32)
        original['quarter'] = original['quarter'].astype(np.int32)

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original, check_index_type=False)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_write_dta10(self, version):
        original = DataFrame(data=[["string", "object", 1, 1.1,
                                    np.datetime64('2003-12-25')]],
                             columns=['string', 'object', 'integer',
                                      'floating', 'datetime'])
        original["object"] = Series(original["object"], dtype=object)
        original.index.name = 'index'
        original.index = original.index.astype(np.int32)
        original['integer'] = original['integer'].astype(np.int32)

        with tm.ensure_clean() as path:
            original.to_stata(path, {'datetime': 'tc'}, version=version)
            written_and_read_again = self.read_dta(path)
            # original.index is np.int32, read index is np.int64
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original, check_index_type=False)

    def test_stata_doc_examples(self):
        with tm.ensure_clean() as path:
            df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
            df.to_stata(path)

    def test_write_preserves_original(self):
        # 9795
        np.random.seed(423)
        df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
        df.loc[2, 'a':'c'] = np.nan
        df_copy = df.copy()
        with tm.ensure_clean() as path:
            df.to_stata(path, write_index=False)
        tm.assert_frame_equal(df, df_copy)

    @pytest.mark.parametrize('version', [114, 117])
    def test_encoding(self, version):
        # GH 4626, proper encoding handling
        raw = read_stata(self.dta_encoding)
        encoded = read_stata(self.dta_encoding, encoding="latin-1")
        result = encoded.kreis1849[0]

        if compat.PY3:
            expected = raw.kreis1849[0]
            assert result == expected
            assert isinstance(result, compat.string_types)
        else:
            expected = raw.kreis1849.str.decode("latin-1")[0]
            assert result == expected
            assert isinstance(result, unicode)  # noqa

        with tm.ensure_clean() as path:
            encoded.to_stata(path, encoding='latin-1',
                             write_index=False, version=version)
            reread_encoded = read_stata(path, encoding='latin-1')
            tm.assert_frame_equal(encoded, reread_encoded)

    def test_read_write_dta11(self):
        original = DataFrame([(1, 2, 3, 4)],
                             columns=['good', compat.u('b\u00E4d'), '8number',
                                      'astringwithmorethan32characters______'])
        formatted = DataFrame([(1, 2, 3, 4)],
                              columns=['good', 'b_d', '_8number',
                                       'astringwithmorethan32characters_'])
        formatted.index.name = 'index'
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                original.to_stata(path, None)
                # should get a warning for that format.
                assert len(w) == 1

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), formatted)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_write_dta12(self, version):
        original = DataFrame([(1, 2, 3, 4, 5, 6)],
                             columns=['astringwithmorethan32characters_1',
                                      'astringwithmorethan32characters_2',
                                      '+',
                                      '-',
                                      'short',
                                      'delete'])
        formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
                              columns=['astringwithmorethan32characters_',
                                       '_0astringwithmorethan32character',
                                       '_',
                                       '_1_',
                                       '_short',
                                       '_delete'])
        formatted.index.name = 'index'
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter('always', InvalidColumnName)
                original.to_stata(path, None, version=version)
                # should get a warning for that format.
                assert len(w) == 1

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), formatted)

    def test_read_write_dta13(self):
        s1 = Series(2 ** 9, dtype=np.int16)
        s2 = Series(2 ** 17, dtype=np.int32)
        s3 = Series(2 ** 33, dtype=np.int64)
        original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
        original.index.name = 'index'

        formatted = original
        formatted['int64'] = formatted['int64'].astype(np.float64)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  formatted)

    @pytest.mark.parametrize('version', [114, 117])
    @pytest.mark.parametrize(
        'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117'])
    def test_read_write_reread_dta14(self, file, parsed_114, version):
        file = getattr(self, file)
        parsed = self.read_dta(file)
        parsed.index.name = 'index'

        expected = self.read_csv(self.csv14)
        cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
        for col in cols:
            expected[col] = expected[col]._convert(datetime=True,
                                                   numeric=True)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['date_td'] = pd.to_datetime(
            expected['date_td'], errors='coerce')

        tm.assert_frame_equal(parsed_114, parsed)

        with tm.ensure_clean() as path:
            parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), parsed_114)

    @pytest.mark.parametrize(
        'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117'])
    def test_read_write_reread_dta15(self, file):
        expected = self.read_csv(self.csv15)
        expected['byte_'] = expected['byte_'].astype(np.int8)
        expected['int_'] = expected['int_'].astype(np.int16)
        expected['long_'] = expected['long_'].astype(np.int32)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['double_'] = expected['double_'].astype(np.float64)
        expected['date_td'] = expected['date_td'].apply(
            datetime.strptime, args=('%Y-%m-%d',))

        file = getattr(self, file)
        parsed = self.read_dta(file)

        tm.assert_frame_equal(expected, parsed)

    @pytest.mark.parametrize('version', [114, 117])
    def test_timestamp_and_label(self, version):
        original = DataFrame([(1,)], columns=['variable'])
        time_stamp = datetime(2000, 2, 29, 14, 21)
        data_label = 'This is a data file.'
        with tm.ensure_clean() as path:
            original.to_stata(path, time_stamp=time_stamp,
                              data_label=data_label,
                              version=version)

            with StataReader(path) as reader:
                assert reader.time_stamp == '29 Feb 2000 14:21'
                assert reader.data_label == data_label

    @pytest.mark.parametrize('version', [114, 117])
    def test_invalid_timestamp(self, version):
        original = DataFrame([(1,)], columns=['variable'])
        time_stamp = '01 Jan 2000, 00:00:00'
        with tm.ensure_clean() as path:
            with pytest.raises(ValueError):
                original.to_stata(path, time_stamp=time_stamp,
                                  version=version)

    def test_numeric_column_names(self):
        original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            # should get a warning for that format.
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
        written_and_read_again = written_and_read_again.set_index('index')
        columns = list(written_and_read_again.columns)
        convert_col_name = lambda x: int(x[1])
        written_and_read_again.columns = map(convert_col_name, columns)
        tm.assert_frame_equal(original, written_and_read_again)

    @pytest.mark.parametrize('version', [114, 117])
    def test_nan_to_missing_value(self, version):
        s1 = Series(np.arange(4.0), dtype=np.float32)
        s2 = Series(np.arange(4.0), dtype=np.float64)
        s1[::2] = np.nan
        s2[1::2] = np.nan
        original = DataFrame({'s1': s1, 's2': s2})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path, version=version)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, original)

    def test_no_index(self):
        columns = ['x', 'y']
        original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
                             columns=columns)
        original.index.name = 'index_not_written'
        with tm.ensure_clean() as path:
            original.to_stata(path, write_index=False)
            written_and_read_again = self.read_dta(path)
            pytest.raises(
                KeyError, lambda: written_and_read_again['index_not_written'])

    def test_string_no_dates(self):
        s1 = Series(['a', 'A longer string'])
        s2 = Series([1.0, 2.0], dtype=np.float64)
        original = DataFrame({'s1': s1, 's2': s2})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original)

    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
        s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified['s1'] = Series(modified['s1'], dtype=np.int16)
            modified['s2'] = Series(modified['s2'], dtype=np.int32)
            modified['s3'] = Series(modified['s3'], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)

    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path, {0: 'tc'})

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ['_0']
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)

    def test_105(self):
        # Data obtained from:
        # http://go.worldbank.org/ZXY29PVJ21
        dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
        df = pd.read_stata(dpath)
        df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
        df0 = pd.DataFrame(df0)
        df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
        df0['clustnum'] = df0["clustnum"].astype(np.int16)
        df0['pri_schl'] = df0["pri_schl"].astype(np.int8)
        df0['psch_num'] = df0["psch_num"].astype(np.int8)
        df0['psch_dis'] = df0["psch_dis"].astype(np.float32)
        tm.assert_frame_equal(df.head(3), df0)

    def test_value_labels_old_format(self):
        # GH 19417
        #
        # Test that value_labels() returns an empty dict if the file format
        # predates supporting value labels.
        dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
        reader = StataReader(dpath)
        assert reader.value_labels() == {}
        reader.close()

    def test_date_export_formats(self):
        columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
        conversions = {c: c for c in columns}
        data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
        original = DataFrame([data], columns=columns)
        original.index.name = 'index'
        expected_values = [datetime(2006, 11, 20, 23, 13, 20),  # Time
                           datetime(2006, 11, 20),  # Day
                           datetime(2006, 11, 19),  # Week
                           datetime(2006, 11, 1),  # Month
                           datetime(2006, 10, 1),  # Quarter year
                           datetime(2006, 7, 1),  # Half year
                           datetime(2006, 1, 1)]  # Year

        expected = DataFrame([expected_values], columns=columns)
        expected.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path, conversions)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected)

    def test_write_missing_strings(self):
        original = DataFrame([["1"], [None]], columns=["foo"])
        expected = DataFrame([["1"], [""]], columns=["foo"])
        expected.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected)

    @pytest.mark.parametrize('version', [114, 117])
    @pytest.mark.parametrize('byteorder', ['>', '<'])
    def test_bool_uint(self, byteorder, version):
        s0 = Series([0, 1, True], dtype=np.bool)
        s1 = Series([0, 1, 100], dtype=np.uint8)
        s2 = Series([0, 1, 255], dtype=np.uint8)
        s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
        s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
        s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
        s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)

        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
                              's4': s4, 's5': s5, 's6': s6})
        original.index.name = 'index'
        expected = original.copy()
        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
                          np.int32, np.float64)
        for c, t in zip(expected.columns, expected_types):
            expected[c] = expected[c].astype(t)

        with tm.ensure_clean() as path:
            original.to_stata(path, byteorder=byteorder, version=version)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, expected)

    def test_variable_labels(self):
        with StataReader(self.dta16_115) as rdr:
            sr_115 = rdr.variable_labels()
        with StataReader(self.dta16_117) as rdr:
            sr_117 = rdr.variable_labels()
        keys = ('var1', 'var2', 'var3')
        labels = ('label1', 'label2', 'label3')
        for k, v in compat.iteritems(sr_115):
            assert k in sr_117
            assert v == sr_117[k]
            assert k in keys
            assert v in labels

    def test_minimal_size_col(self):
        str_lens = (1, 100, 244)
        s = {}
        for str_len in str_lens:
            s['s' + str(str_len)] = Series(['a' * str_len,
                                            'b' * str_len, 'c' * str_len])
        original = DataFrame(s)
        with tm.ensure_clean() as path:
            original.to_stata(path, write_index=False)

            with StataReader(path) as sr:
                typlist = sr.typlist
                variables = sr.varlist
                formats = sr.fmtlist
                for variable, fmt, typ in zip(variables, formats, typlist):
                    assert int(variable[1:]) == int(fmt[1:-1])
                    assert int(variable[1:]) == typ

    def test_excessively_long_string(self):
        str_lens = (1, 244, 500)
        s = {}
        for str_len in str_lens:
            s['s' + str(str_len)] = Series(['a' * str_len,
                                            'b' * str_len, 'c' * str_len])
        original = DataFrame(s)
        with pytest.raises(ValueError):
            with tm.ensure_clean() as path:
                original.to_stata(path)

    def test_missing_value_generator(self):
        types = ('b', 'h', 'l')
        df = DataFrame([[0.0]], columns=['float_'])
        with tm.ensure_clean() as path:
            df.to_stata(path)
            with StataReader(path) as rdr:
                valid_range = rdr.VALID_RANGE
        expected_values = ['.' + chr(97 + i) for i in range(26)]
        expected_values.insert(0, '.')
        for t in types:
            offset = valid_range[t][1]
            for i in range(0, 27):
                val = StataMissingValue(offset + 1 + i)
                assert val.string == expected_values[i]

        # Test extremes for floats
        val = StataMissingValue(struct.unpack('