# -*- coding: utf-8 -*- from __future__ import print_function import csv import pytest from numpy import nan import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) from pandas.io.common import _get_handle import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, date_range, read_csv, compat, to_datetime) import pandas as pd from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, ensure_clean, makeCustomDataframe as mkdf) import pandas.util.testing as tm from pandas.tests.frame.common import TestData MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64'] MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16', 'int32', 'int64'] class TestDataFrameToCSV(TestData): def read_csv(self, path, **kwargs): params = dict(index_col=0, parse_dates=True) params.update(**kwargs) return pd.read_csv(path, **params) def test_from_csv_deprecation(self): # see gh-17812 with ensure_clean('__tmp_from_csv_deprecation__') as path: self.tsframe.to_csv(path) with tm.assert_produces_warning(FutureWarning): depr_recons = DataFrame.from_csv(path) assert_frame_equal(self.tsframe, depr_recons) def test_to_csv_from_csv1(self): with ensure_clean('__tmp_to_csv_from_csv1__') as path: self.frame['A'][:5] = nan self.frame.to_csv(path) self.frame.to_csv(path, columns=['A', 'B']) self.frame.to_csv(path, header=False) self.frame.to_csv(path, index=False) # test roundtrip self.tsframe.to_csv(path) recons = self.read_csv(path) assert_frame_equal(self.tsframe, recons) self.tsframe.to_csv(path, index_label='index') recons = self.read_csv(path, index_col=None) assert(len(recons.columns) == len(self.tsframe.columns) + 1) # no index self.tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) assert_almost_equal(self.tsframe.values, recons.values) # corner case dm = DataFrame({'s1': Series(lrange(3), lrange(3)), 's2': Series(lrange(2), lrange(2))}) dm.to_csv(path) recons = self.read_csv(path) assert_frame_equal(dm, recons) def test_to_csv_from_csv2(self): with ensure_clean('__tmp_to_csv_from_csv2__') as path: # duplicate index df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'], columns=['x', 'y', 'z']) df.to_csv(path) result = self.read_csv(path) assert_frame_equal(result, df) midx = MultiIndex.from_tuples( [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)]) df = DataFrame(np.random.randn(3, 3), index=midx, columns=['x', 'y', 'z']) df.to_csv(path) result = self.read_csv(path, index_col=[0, 1, 2], parse_dates=False) assert_frame_equal(result, df, check_names=False) # column aliases col_aliases = Index(['AA', 'X', 'Y', 'Z']) self.frame2.to_csv(path, header=col_aliases) rs = self.read_csv(path) xp = self.frame2.copy() xp.columns = col_aliases assert_frame_equal(xp, rs) pytest.raises(ValueError, self.frame2.to_csv, path, header=['AA', 'X']) def test_to_csv_from_csv3(self): with ensure_clean('__tmp_to_csv_from_csv3__') as path: df1 = DataFrame(np.random.randn(3, 1)) df2 = DataFrame(np.random.randn(3, 1)) df1.to_csv(path) df2.to_csv(path, mode='a', header=False) xp = pd.concat([df1, df2]) rs = pd.read_csv(path, index_col=0) rs.columns = lmap(int, rs.columns) xp.columns = lmap(int, xp.columns) assert_frame_equal(xp, rs) def test_to_csv_from_csv4(self): with ensure_clean('__tmp_to_csv_from_csv4__') as path: # GH 10833 (TimedeltaIndex formatting) dt = pd.Timedelta(seconds=1) df = pd.DataFrame({'dt_data': [i * dt for i in range(3)]}, index=pd.Index([i * dt for i in range(3)], name='dt_index')) df.to_csv(path) result = pd.read_csv(path, index_col='dt_index') result.index = pd.to_timedelta(result.index) # TODO: remove renaming when GH 10875 is solved result.index = result.index.rename('dt_index') result['dt_data'] = pd.to_timedelta(result['dt_data']) assert_frame_equal(df, result, check_index_type=True) def test_to_csv_from_csv5(self): # tz, 8260 with ensure_clean('__tmp_to_csv_from_csv5__') as path: self.tzframe.to_csv(path) result = pd.read_csv(path, index_col=0, parse_dates=['A']) converter = lambda c: to_datetime(result[c]).dt.tz_localize( 'UTC').dt.tz_convert(self.tzframe[c].dt.tz) result['B'] = converter('B') result['C'] = converter('C') assert_frame_equal(result, self.tzframe) def test_to_csv_cols_reordering(self): # GH3454 import pandas as pd chunksize = 5 N = int(chunksize * 2.5) df = mkdf(N, 3) cs = df.columns cols = [cs[2], cs[0]] with ensure_clean() as path: df.to_csv(path, columns=cols, chunksize=chunksize) rs_c = pd.read_csv(path, index_col=0) assert_frame_equal(df[cols], rs_c, check_names=False) def test_to_csv_new_dupe_cols(self): import pandas as pd def _check_df(df, cols=None): with ensure_clean() as path: df.to_csv(path, columns=cols, chunksize=chunksize) rs_c = pd.read_csv(path, index_col=0) # we wrote them in a different order # so compare them in that order if cols is not None: if df.columns.is_unique: rs_c.columns = cols else: indexer, missing = df.columns.get_indexer_non_unique( cols) rs_c.columns = df.columns.take(indexer) for c in cols: obj_df = df[c] obj_rs = rs_c[c] if isinstance(obj_df, Series): assert_series_equal(obj_df, obj_rs) else: assert_frame_equal( obj_df, obj_rs, check_names=False) # wrote in the same order else: rs_c.columns = df.columns assert_frame_equal(df, rs_c, check_names=False) chunksize = 5 N = int(chunksize * 2.5) # dupe cols df = mkdf(N, 3) df.columns = ['a', 'a', 'b'] _check_df(df, None) # dupe cols with selection cols = ['b', 'a'] _check_df(df, cols) @pytest.mark.slow def test_to_csv_dtnat(self): # GH3437 from pandas import NaT def make_dtnat_arr(n, nnat=None): if nnat is None: nnat = int(n * 0.1) # 10% s = list(date_range('2000', freq='5min', periods=n)) if nnat: for i in np.random.randint(0, len(s), nnat): s[i] = NaT i = np.random.randint(100) s[-i] = NaT s[i] = NaT return s chunksize = 1000 # N=35000 s1 = make_dtnat_arr(chunksize + 5) s2 = make_dtnat_arr(chunksize + 5, 0) # s3=make_dtnjat_arr(chunksize+5,0) with ensure_clean('1.csv') as pth: df = DataFrame(dict(a=s1, b=s2)) df.to_csv(pth, chunksize=chunksize) recons = self.read_csv(pth)._convert(datetime=True, coerce=True) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) @pytest.mark.slow def test_to_csv_moar(self): def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs['index_col'] = lrange(rnlvl) kwargs['header'] = lrange(cnlvl) with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = self.read_csv(path, **kwargs) else: kwargs['header'] = 0 with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = self.read_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): return x.decode('utf8') return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [recons.iloc[ :, i].values for i in range(rnlvl - 1)] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') if r_dtype: if r_dtype == 'u': # unicode r_dtype = 'O' recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype) df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype) elif r_dtype == 'dt': # unicode r_dtype = 'O' recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype) df.index = np.array( lmap(Timestamp, df.index), dtype=r_dtype) elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array( list(map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array( list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == 'u': c_dtype = 'O' recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(_to_uni, df.columns), dtype=c_dtype) elif c_dtype == 'dt': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns), dtype=c_dtype) elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array( lmap(Timestamp, to_datetime(recons.columns)), dtype=c_dtype) df.columns = np.array( lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True) N = 100 chunksize = 1000 for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') for ncols in [4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type='dt', c_idx_type='s'), 'dt', 's') pass for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'), ('p', 'p')]: for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type, c_idx_type=c_idx_type), r_idx_type, c_idx_type) for ncols in [1, 2, 3, 4]: base = int((chunksize // ncols or 1) or 1) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols)) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]: df = mkdf(nrows, 3) cols = list(df.columns) cols[:2] = ["dupe", "dupe"] cols[-2:] = ["dupe", "dupe"] ix = list(df.index) ix[:2] = ["rdupe", "rdupe"] ix[-2:] = ["rdupe", "rdupe"] df.index = ix df.columns = cols _do_test(df, dupe_col=True) _do_test(DataFrame(index=lrange(10))) _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2) for ncols in [2, 3, 4]: base = int(chunksize // ncols) for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2, 2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2, base - 1, base, base + 1]: _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2) _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2) _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2), rnlvl=2, cnlvl=2) def test_to_csv_from_csv_w_some_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix self.frame['G'] = np.nan f = lambda x: [np.inf, np.nan][np.random.rand() < .5] self.frame['H'] = self.frame.index.map(f) with ensure_clean() as path: self.frame.to_csv(path) recons = self.read_csv(path) # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False) def test_to_csv_from_csv_w_all_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix self.frame['E'] = np.inf self.frame['F'] = -np.inf with ensure_clean() as path: self.frame.to_csv(path) recons = self.read_csv(path) # TODO to_csv drops column name assert_frame_equal(self.frame, recons, check_names=False) assert_frame_equal(np.isinf(self.frame), np.isinf(recons), check_names=False) def test_to_csv_no_index(self): # GH 3624, after appending columns, to_csv fails with ensure_clean('__tmp_to_csv_no_index__') as path: df = DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6]}) df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) df['c3'] = Series([7, 8, 9], dtype='int64') df.to_csv(path, index=False) result = read_csv(path) assert_frame_equal(df, result) def test_to_csv_with_mix_columns(self): # gh-11637: incorrect output when a mix of integer and string column # names passed as columns parameter in to_csv df = DataFrame({0: ['a', 'b', 'c'], 1: ['aa', 'bb', 'cc']}) df['test'] = 'txt' assert df.to_csv() == df.to_csv(columns=[0, 1, 'test']) def test_to_csv_headers(self): # GH6186, the presence or absence of `index` incorrectly # causes to_csv to have different header semantics. from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y']) with ensure_clean('__tmp_to_csv_headers__') as path: from_df.to_csv(path, header=['X', 'Y']) recons = self.read_csv(path) assert_frame_equal(to_df, recons) from_df.to_csv(path, index=False, header=['X', 'Y']) recons = self.read_csv(path) recons.reset_index(inplace=True) assert_frame_equal(to_df, recons) def test_to_csv_multiindex(self): frame = self.frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) frame.index = new_index with ensure_clean('__tmp_to_csv_multiindex__') as path: frame.to_csv(path, header=False) frame.to_csv(path, columns=['A', 'B']) # round trip frame.to_csv(path) df = self.read_csv(path, index_col=[0, 1], parse_dates=False) # TODO to_csv drops column name assert_frame_equal(frame, df, check_names=False) assert frame.index.names == df.index.names # needed if setUp becomes a class method self.frame.index = old_index # try multiindex with dates tsframe = self.tsframe old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.to_csv(path, index_label=['time', 'foo']) recons = self.read_csv(path, index_col=[0, 1]) # TODO to_csv drops column name assert_frame_equal(tsframe, recons, check_names=False) # do not load index tsframe.to_csv(path) recons = self.read_csv(path, index_col=None) assert len(recons.columns) == len(tsframe.columns) + 2 # no index tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) assert_almost_equal(recons.values, self.tsframe.values) # needed if setUp becomes class method self.tsframe.index = old_index with ensure_clean('__tmp_to_csv_multiindex__') as path: # GH3571, GH1651, GH3141 def _make_frame(names=None): if names is True: names = ['first', 'second'] return DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=MultiIndex.from_tuples( [('bah', 'foo'), ('bah', 'bar'), ('ban', 'baz')], names=names), dtype='int64') # column & index are multi-index df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1]) assert_frame_equal(df, result) # column is mi df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4) df.to_csv(path) result = read_csv( path, header=[0, 1, 2, 3], index_col=0) assert_frame_equal(df, result) # dup column names? df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4) df.to_csv(path) result = read_csv(path, header=[0, 1, 2, 3], index_col=[0, 1, 2]) assert_frame_equal(df, result) # writing with no index df = _make_frame() df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) assert_frame_equal(df, result) # we lose the names here df = _make_frame(True) df.to_csv(path, index=False) result = read_csv(path, header=[0, 1]) assert com._all_none(*result.columns.names) result.columns.names = df.columns.names assert_frame_equal(df, result) # tupleize_cols=True and index=False df = _make_frame(True) with tm.assert_produces_warning(FutureWarning): df.to_csv(path, tupleize_cols=True, index=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = read_csv(path, header=0, tupleize_cols=True, index_col=None) result.columns = df.columns assert_frame_equal(df, result) # whatsnew example df = _make_frame() df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) df = _make_frame(True) df.to_csv(path) result = read_csv(path, header=[0, 1], index_col=[0]) assert_frame_equal(df, result) # column & index are multi-index (compatibility) df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) with tm.assert_produces_warning(FutureWarning): df.to_csv(path, tupleize_cols=True) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = read_csv(path, header=0, index_col=[0, 1], tupleize_cols=True) result.columns = df.columns assert_frame_equal(df, result) # invalid options df = _make_frame(True) df.to_csv(path) for i in [6, 7]: msg = 'len of {i}, but only 5 lines in file'.format(i=i) with tm.assert_raises_regex(ParserError, msg): read_csv(path, header=lrange(i), index_col=0) # write with cols with tm.assert_raises_regex(TypeError, 'cannot specify cols ' 'with a MultiIndex'): df.to_csv(path, columns=['foo', 'bar']) with ensure_clean('__tmp_to_csv_multiindex__') as path: # empty tsframe[:0].to_csv(path) recons = self.read_csv(path) exp = tsframe[:0] exp.index = [] tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path: df.to_csv(path, na_rep=999) with open(path) as f: lines = f.readlines() assert lines[1].split(',')[2] == '999' def test_to_csv_withcommas(self): # Commas inside fields should be correctly escaped when saving as CSV. df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']}) with ensure_clean('__tmp_to_csv_withcommas__.csv') as path: df.to_csv(path) df2 = self.read_csv(path) assert_frame_equal(df2, df) def test_to_csv_mixed(self): def create_cols(name): return ["%s%03d" % (name, i) for i in range(5)] df_float = DataFrame(np.random.randn( 100, 5), dtype='float64', columns=create_cols('float')) df_int = DataFrame(np.random.randn(100, 5), dtype='int64', columns=create_cols('int')) df_bool = DataFrame(True, index=df_float.index, columns=create_cols('bool')) df_object = DataFrame('foo', index=df_float.index, columns=create_cols('object')) df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=create_cols('date')) # add in some nans df_float.loc[30:50, 1:3] = np.nan # ## this is a bug in read_csv right now #### # df_dt.loc[30:50,1:3] = np.nan df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1) # dtype dtypes = dict() for n, dtype in [('float', np.float64), ('int', np.int64), ('bool', np.bool), ('object', np.object)]: for c in create_cols(n): dtypes[c] = dtype with ensure_clean() as filename: df.to_csv(filename) rs = read_csv(filename, index_col=0, dtype=dtypes, parse_dates=create_cols('date')) assert_frame_equal(rs, df) def test_to_csv_dups_cols(self): df = DataFrame(np.random.randn(1000, 30), columns=lrange( 15) + lrange(15), dtype='float64') with ensure_clean() as filename: df.to_csv(filename) # single dtype, fine result = read_csv(filename, index_col=0) result.columns = df.columns assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype='float64') df_int = DataFrame(np.random.randn(1000, 3), dtype='int64') df_bool = DataFrame(True, index=df_float.index, columns=lrange(3)) df_object = DataFrame('foo', index=df_float.index, columns=lrange(3)) df_dt = DataFrame(Timestamp('20010101'), index=df_float.index, columns=lrange(3)) df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True) cols = [] for i in range(5): cols.extend([0, 1, 2]) df.columns = cols with ensure_clean() as filename: df.to_csv(filename) result = read_csv(filename, index_col=0) # date cols for i in ['0.4', '1.4', '2.4']: result[i] = to_datetime(result[i]) result.columns = df.columns assert_frame_equal(result, df) # GH3457 from pandas.util.testing import makeCustomDataframe as mkdf N = 10 df = mkdf(N, 3) df.columns = ['a', 'a', 'b'] with ensure_clean() as filename: df.to_csv(filename) # read_csv will rename the dups columns result = read_csv(filename, index_col=0) result = result.rename(columns={'a.1': 'a'}) assert_frame_equal(result, df) def test_to_csv_chunking(self): aa = DataFrame({'A': lrange(100000)}) aa['B'] = aa.A + 1.0 aa['C'] = aa.A + 2.0 aa['D'] = aa.A + 3.0 for chunksize in [10000, 50000, 100000]: with ensure_clean() as filename: aa.to_csv(filename, chunksize=chunksize) rs = read_csv(filename, index_col=0) assert_frame_equal(rs, aa) @pytest.mark.slow def test_to_csv_wide_frame_formatting(self): # Issue #8621 df = DataFrame(np.random.randn(1, 100010), columns=None, index=None) with ensure_clean() as filename: df.to_csv(filename, header=False, index=False) rs = read_csv(filename, header=None) assert_frame_equal(rs, df) def test_to_csv_bug(self): f1 = StringIO('a,1.0\nb,2.0') df = self.read_csv(f1, header=None) newdf = DataFrame({'t': df[df.columns[0]]}) with ensure_clean() as path: newdf.to_csv(path) recons = read_csv(path, index_col=0) # don't check_names as t != 1 assert_frame_equal(recons, newdf, check_names=False) def test_to_csv_unicode(self): df = DataFrame({u('c/\u03c3'): [1, 2, 3]}) with ensure_clean() as path: df.to_csv(path, encoding='UTF-8') df2 = read_csv(path, index_col=0, encoding='UTF-8') assert_frame_equal(df, df2) df.to_csv(path, encoding='UTF-8', index=False) df2 = read_csv(path, index_col=None, encoding='UTF-8') assert_frame_equal(df, df2) def test_to_csv_unicode_index_col(self): buf = StringIO('') df = DataFrame( [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]], columns=[u("\u05d0"), u("\u05d1"), u("\u05d2"), u("\u05d3")], index=[u("\u05d0"), u("\u05d1")]) df.to_csv(buf, encoding='UTF-8') buf.seek(0) df2 = read_csv(buf, index_col=0, encoding='UTF-8') assert_frame_equal(df, df2) def test_to_csv_stringio(self): buf = StringIO() self.frame.to_csv(buf) buf.seek(0) recons = read_csv(buf, index_col=0) # TODO to_csv drops column name assert_frame_equal(recons, self.frame, check_names=False) def test_to_csv_float_format(self): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']) with ensure_clean() as filename: df.to_csv(filename, float_format='%.2f') rs = read_csv(filename, index_col=0) xp = DataFrame([[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], index=['A', 'B'], columns=['X', 'Y', 'Z']) assert_frame_equal(rs, xp) def test_to_csv_unicodewriter_quoting(self): df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) buf = StringIO() df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC, encoding='utf-8') result = buf.getvalue() expected = ('"A","B"\n' '1,"foo"\n' '2,"bar"\n' '3,"baz"\n') assert result == expected def test_to_csv_quote_none(self): # GH4328 df = DataFrame({'A': ['hello', '{"hello"}']}) for encoding in (None, 'utf-8'): buf = StringIO() df.to_csv(buf, quoting=csv.QUOTE_NONE, encoding=encoding, index=False) result = buf.getvalue() expected = 'A\nhello\n{"hello"}\n' assert result == expected def test_to_csv_index_no_leading_comma(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['one', 'two', 'three']) buf = StringIO() df.to_csv(buf, index_label=False) expected = ('A,B\n' 'one,1,4\n' 'two,2,5\n' 'three,3,6\n') assert buf.getvalue() == expected def test_to_csv_line_terminators(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['one', 'two', 'three']) buf = StringIO() df.to_csv(buf, line_terminator='\r\n') expected = (',A,B\r\n' 'one,1,4\r\n' 'two,2,5\r\n' 'three,3,6\r\n') assert buf.getvalue() == expected buf = StringIO() df.to_csv(buf) # The default line terminator remains \n expected = (',A,B\n' 'one,1,4\n' 'two,2,5\n' 'three,3,6\n') assert buf.getvalue() == expected def test_to_csv_from_csv_categorical(self): # CSV with categoricals should result in the same output as when one # would add a "normal" Series/DataFrame. s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) res = StringIO() s.to_csv(res) exp = StringIO() s2.to_csv(exp) assert res.getvalue() == exp.getvalue() df = DataFrame({"s": s}) df2 = DataFrame({"s": s2}) res = StringIO() df.to_csv(res) exp = StringIO() df2.to_csv(exp) assert res.getvalue() == exp.getvalue() def test_to_csv_path_is_none(self): # GH 8215 # Make sure we return string for consistency with # Series.to_csv() csv_str = self.frame.to_csv(path_or_buf=None) assert isinstance(csv_str, str) recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) @pytest.mark.parametrize('df,encoding', [ (DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=['A', 'B'], columns=['X', 'Y', 'Z']), None), # GH 21241, 21118 (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), (DataFrame(5 * [[123, u"你好", u"世界"]], columns=['X', 'Y', 'Z']), 'gb2312'), (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]], columns=['X', 'Y', 'Z']), 'cp737') ]) def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv result = read_csv(filename, compression=compression, index_col=0, encoding=encoding) assert_frame_equal(df, result) # test the round trip using file handle - to_csv -> read_csv f, _handles = _get_handle(filename, 'w', compression=compression, encoding=encoding) with f: df.to_csv(f, encoding=encoding) result = pd.read_csv(filename, compression=compression, encoding=encoding, index_col=0, squeeze=True) assert_frame_equal(df, result) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: text = fh.read().decode(encoding or 'utf8') for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: dt_index = self.tsframe.index datetime_frame = DataFrame( {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) datetime_frame.to_csv(path, date_format='%Y%m%d') # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_int = datetime_frame.applymap( lambda x: int(x.strftime('%Y%m%d'))) datetime_frame_int.index = datetime_frame_int.index.map( lambda x: int(x.strftime('%Y%m%d'))) assert_frame_equal(test, datetime_frame_int) datetime_frame.to_csv(path, date_format='%Y-%m-%d') # Check that the data was put in the specified format test = read_csv(path, index_col=0) datetime_frame_str = datetime_frame.applymap( lambda x: x.strftime('%Y-%m-%d')) datetime_frame_str.index = datetime_frame_str.index.map( lambda x: x.strftime('%Y-%m-%d')) assert_frame_equal(test, datetime_frame_str) # Check that columns get converted datetime_frame_columns = datetime_frame.T datetime_frame_columns.to_csv(path, date_format='%Y%m%d') test = read_csv(path, index_col=0) datetime_frame_columns = datetime_frame_columns.applymap( lambda x: int(x.strftime('%Y%m%d'))) # Columns don't get converted to ints by read_csv datetime_frame_columns.columns = ( datetime_frame_columns.columns .map(lambda x: x.strftime('%Y%m%d'))) assert_frame_equal(test, datetime_frame_columns) # test NaTs nat_index = to_datetime( ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) nat_frame = DataFrame({'A': nat_index}, index=nat_index) nat_frame.to_csv(path, date_format='%Y-%m-%d') test = read_csv(path, parse_dates=[0, 1], index_col=0) assert_frame_equal(test, nat_frame) def test_to_csv_with_dst_transitions(self): with ensure_clean('csv_date_format_with_dst') as path: # make sure we are not failing on transitions times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H", ambiguous='infer') for i in [times, times + pd.Timedelta('10s')]: time_range = np.array(range(len(i)), dtype='int64') df = DataFrame({'A': time_range}, index=i) df.to_csv(path, index=True) # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/London') assert_frame_equal(result, df) # GH11619 idx = pd.date_range('2015-01-01', '2015-12-31', freq='H', tz='Europe/Paris') df = DataFrame({'values': 1, 'idx': idx}, index=idx) with ensure_clean('csv_date_format_with_dst') as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/Paris') result['idx'] = to_datetime(result['idx']).astype( 'datetime64[ns, Europe/Paris]') assert_frame_equal(result, df) # assert working df.astype(str) with ensure_clean('csv_date_format_with_dst') as path: df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) def test_to_csv_quoting(self): df = DataFrame({ 'c_bool': [True, False], 'c_float': [1.0, 3.2], 'c_int': [42, np.nan], 'c_string': ['a', 'b,c'], }) expected = """\ ,c_bool,c_float,c_int,c_string 0,True,1.0,42.0,a 1,False,3.2,,"b,c" """ result = df.to_csv() assert result == expected result = df.to_csv(quoting=None) assert result == expected result = df.to_csv(quoting=csv.QUOTE_MINIMAL) assert result == expected expected = """\ "","c_bool","c_float","c_int","c_string" "0","True","1.0","42.0","a" "1","False","3.2","","b,c" """ result = df.to_csv(quoting=csv.QUOTE_ALL) assert result == expected # see gh-12922, gh-13259: make sure changes to # the formatters do not break this behaviour expected = """\ "","c_bool","c_float","c_int","c_string" 0,True,1.0,42.0,"a" 1,False,3.2,"","b,c" """ result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) assert result == expected msg = "need to escape, but no escapechar set" tm.assert_raises_regex(csv.Error, msg, df.to_csv, quoting=csv.QUOTE_NONE) tm.assert_raises_regex(csv.Error, msg, df.to_csv, quoting=csv.QUOTE_NONE, escapechar=None) expected = """\ ,c_bool,c_float,c_int,c_string 0,True,1.0,42.0,a 1,False,3.2,,b!,c """ result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='!') assert result == expected expected = """\ ,c_bool,c_ffloat,c_int,c_string 0,True,1.0,42.0,a 1,False,3.2,,bf,c """ result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='f') assert result == expected # see gh-3503: quoting Windows line terminators # presents with encoding? text = 'a,b,c\n1,"test \r\n",3\n' df = pd.read_csv(StringIO(text)) buf = StringIO() df.to_csv(buf, encoding='utf-8', index=False) assert buf.getvalue() == text # xref gh-7791: make sure the quoting parameter is passed through # with multi-indexes df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) df = df.set_index(['a', 'b']) expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' assert df.to_csv(quoting=csv.QUOTE_ALL) == expected def test_period_index_date_overflow(self): # see gh-15982 dates = ["1990-01-01", "2000-01-01", "3005-01-01"] index = pd.PeriodIndex(dates, freq="D") df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() expected = ',0\n1990-01-01,4\n2000-01-01,5\n3005-01-01,6\n' assert result == expected date_format = "%m-%d-%Y" result = df.to_csv(date_format=date_format) expected = ',0\n01-01-1990,4\n01-01-2000,5\n01-01-3005,6\n' assert result == expected # Overflow with pd.NaT dates = ["1990-01-01", pd.NaT, "3005-01-01"] index = pd.PeriodIndex(dates, freq="D") df = pd.DataFrame([4, 5, 6], index=index) result = df.to_csv() expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n' assert result == expected def test_multi_index_header(self): # see gh-5539 columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) df.columns = columns header = ["a", "b", "c", "d"] result = df.to_csv(header=header) expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n" assert result == expected