# laywerrobot/lib/python3.6/site-packages/pandas/tests/frame/test_to_csv.py
# -*- coding: utf-8 -*-

from __future__ import print_function

import csv

import pytest
from numpy import nan
import numpy as np

from pandas.compat import (lmap, range, lrange, StringIO, u)
from pandas.io.common import _get_handle
import pandas.core.common as com
from pandas.errors import ParserError
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
                    date_range, read_csv, compat, to_datetime)
import pandas as pd

from pandas.util.testing import (assert_almost_equal,
                                 assert_series_equal,
                                 assert_frame_equal,
                                 ensure_clean,
                                 makeCustomDataframe as mkdf)
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData

MIXED_FLOAT_DTYPES = ['float16', 'float32', 'float64']
MIXED_INT_DTYPES = ['uint8', 'uint16', 'uint32', 'uint64', 'int8', 'int16',
                    'int32', 'int64']
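

# Round-trip tests for DataFrame.to_csv: frames are written out and read
# back with read_csv, then the reconstruction is compared to the original.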
class TestDataFrameToCSV(TestData):

    def read_csv(self, path, **kwargs):
        params = dict(index_col=0, parse_dates=True)
        params.update(**kwargs)

        return pd.read_csv(path, **params)

    def test_from_csv_deprecation(self):
        # see gh-17812
        with ensure_clean('__tmp_from_csv_deprecation__') as path:
            self.tsframe.to_csv(path)

            with tm.assert_produces_warning(FutureWarning):
                depr_recons = DataFrame.from_csv(path)
                assert_frame_equal(self.tsframe, depr_recons)
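
    # The *_from_csv* tests below write frames with various shapes of index
    # (default, duplicate, MultiIndex, timedelta, tz-aware) and assert that
    # read_csv reconstructs them.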
    def test_to_csv_from_csv1(self):

        with ensure_clean('__tmp_to_csv_from_csv1__') as path:
            self.frame['A'][:5] = nan

            self.frame.to_csv(path)
            self.frame.to_csv(path, columns=['A', 'B'])
            self.frame.to_csv(path, header=False)
            self.frame.to_csv(path, index=False)

            # test roundtrip
            self.tsframe.to_csv(path)
            recons = self.read_csv(path)
            assert_frame_equal(self.tsframe, recons)

            self.tsframe.to_csv(path, index_label='index')
            recons = self.read_csv(path, index_col=None)
            assert(len(recons.columns) == len(self.tsframe.columns) + 1)

            # no index
            self.tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            assert_almost_equal(self.tsframe.values, recons.values)

            # corner case
            dm = DataFrame({'s1': Series(lrange(3), lrange(3)),
                            's2': Series(lrange(2), lrange(2))})
            dm.to_csv(path)

            recons = self.read_csv(path)
            assert_frame_equal(dm, recons)

    def test_to_csv_from_csv2(self):

        with ensure_clean('__tmp_to_csv_from_csv2__') as path:

            # duplicate index
            df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'],
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = self.read_csv(path)
            assert_frame_equal(result, df)

            midx = MultiIndex.from_tuples(
                [('A', 1, 2), ('A', 1, 2), ('B', 1, 2)])
            df = DataFrame(np.random.randn(3, 3), index=midx,
                           columns=['x', 'y', 'z'])
            df.to_csv(path)
            result = self.read_csv(path, index_col=[0, 1, 2],
                                   parse_dates=False)
            assert_frame_equal(result, df, check_names=False)

            # column aliases
            col_aliases = Index(['AA', 'X', 'Y', 'Z'])
            self.frame2.to_csv(path, header=col_aliases)

            rs = self.read_csv(path)
            xp = self.frame2.copy()
            xp.columns = col_aliases
            assert_frame_equal(xp, rs)

            pytest.raises(ValueError, self.frame2.to_csv, path,
                          header=['AA', 'X'])

    def test_to_csv_from_csv3(self):

        with ensure_clean('__tmp_to_csv_from_csv3__') as path:
            df1 = DataFrame(np.random.randn(3, 1))
            df2 = DataFrame(np.random.randn(3, 1))

            df1.to_csv(path)
            df2.to_csv(path, mode='a', header=False)
            xp = pd.concat([df1, df2])
            rs = pd.read_csv(path, index_col=0)
            rs.columns = lmap(int, rs.columns)
            xp.columns = lmap(int, xp.columns)
            assert_frame_equal(xp, rs)

    def test_to_csv_from_csv4(self):

        with ensure_clean('__tmp_to_csv_from_csv4__') as path:
            # GH 10833 (TimedeltaIndex formatting)
            dt = pd.Timedelta(seconds=1)
            df = pd.DataFrame({'dt_data': [i * dt for i in range(3)]},
                              index=pd.Index([i * dt for i in range(3)],
                                             name='dt_index'))
            df.to_csv(path)

            result = pd.read_csv(path, index_col='dt_index')
            result.index = pd.to_timedelta(result.index)
            # TODO: remove renaming when GH 10875 is solved
            result.index = result.index.rename('dt_index')
            result['dt_data'] = pd.to_timedelta(result['dt_data'])

            assert_frame_equal(df, result, check_index_type=True)
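
    # Timezone-aware values come back naive from read_csv, so the tz test
    # below re-localizes to UTC and converts back before comparing (gh-8260).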
    def test_to_csv_from_csv5(self):

        # tz, 8260
        with ensure_clean('__tmp_to_csv_from_csv5__') as path:

            self.tzframe.to_csv(path)
            result = pd.read_csv(path, index_col=0, parse_dates=['A'])

            converter = lambda c: to_datetime(result[c]).dt.tz_localize(
                'UTC').dt.tz_convert(self.tzframe[c].dt.tz)
            result['B'] = converter('B')
            result['C'] = converter('C')
            assert_frame_equal(result, self.tzframe)

    def test_to_csv_cols_reordering(self):
        # GH3454
        import pandas as pd

        chunksize = 5
        N = int(chunksize * 2.5)

        df = mkdf(N, 3)
        cs = df.columns
        cols = [cs[2], cs[0]]

        with ensure_clean() as path:
            df.to_csv(path, columns=cols, chunksize=chunksize)
            rs_c = pd.read_csv(path, index_col=0)
            assert_frame_equal(df[cols], rs_c, check_names=False)

    def test_to_csv_new_dupe_cols(self):
        import pandas as pd

        def _check_df(df, cols=None):
            with ensure_clean() as path:
                df.to_csv(path, columns=cols, chunksize=chunksize)
                rs_c = pd.read_csv(path, index_col=0)

                # we wrote them in a different order
                # so compare them in that order
                if cols is not None:

                    if df.columns.is_unique:
                        rs_c.columns = cols
                    else:
                        indexer, missing = df.columns.get_indexer_non_unique(
                            cols)
                        rs_c.columns = df.columns.take(indexer)

                    for c in cols:
                        obj_df = df[c]
                        obj_rs = rs_c[c]
                        if isinstance(obj_df, Series):
                            assert_series_equal(obj_df, obj_rs)
                        else:
                            assert_frame_equal(
                                obj_df, obj_rs, check_names=False)

                # wrote in the same order
                else:
                    rs_c.columns = df.columns
                    assert_frame_equal(df, rs_c, check_names=False)

        chunksize = 5
        N = int(chunksize * 2.5)

        # dupe cols
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']
        _check_df(df, None)

        # dupe cols with selection
        cols = ['b', 'a']
        _check_df(df, cols)
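
    # Chunked writes: datetime columns containing NaT must survive a
    # chunked to_csv round trip (GH3437).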
    @pytest.mark.slow
    def test_to_csv_dtnat(self):
        # GH3437
        from pandas import NaT

        def make_dtnat_arr(n, nnat=None):
            if nnat is None:
                nnat = int(n * 0.1)  # 10%
            s = list(date_range('2000', freq='5min', periods=n))
            if nnat:
                for i in np.random.randint(0, len(s), nnat):
                    s[i] = NaT
                i = np.random.randint(100)
                s[-i] = NaT
                s[i] = NaT
            return s

        chunksize = 1000
        # N=35000
        s1 = make_dtnat_arr(chunksize + 5)
        s2 = make_dtnat_arr(chunksize + 5, 0)

        # s3=make_dtnat_arr(chunksize+5,0)
        with ensure_clean('1.csv') as pth:
            df = DataFrame(dict(a=s1, b=s2))
            df.to_csv(pth, chunksize=chunksize)

            recons = self.read_csv(pth)._convert(datetime=True,
                                                 coerce=True)
            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)
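
    # _do_test in the next test writes a frame in chunks, reads it back,
    # coerces index/column dtypes on both sides to a common representation,
    # and asserts equality across many nrows/ncols/dtype combinations.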
    @pytest.mark.slow
    def test_to_csv_moar(self):

        def _do_test(df, r_dtype=None, c_dtype=None,
                     rnlvl=None, cnlvl=None, dupe_col=False):

            kwargs = dict(parse_dates=False)
            if cnlvl:
                if rnlvl is not None:
                    kwargs['index_col'] = lrange(rnlvl)
                kwargs['header'] = lrange(cnlvl)

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8',
                              chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)
            else:
                kwargs['header'] = 0

                with ensure_clean('__tmp_to_csv_moar__') as path:
                    df.to_csv(path, encoding='utf8', chunksize=chunksize)
                    recons = self.read_csv(path, **kwargs)

            def _to_uni(x):
                if not isinstance(x, compat.text_type):
                    return x.decode('utf8')
                return x

            if dupe_col:
                # read_csv disambiguates duplicate columns by labeling
                # them dupe.1, dupe.2, etc. Monkey-patch the columns back.
                recons.columns = df.columns
            if rnlvl and not cnlvl:
                delta_lvl = [recons.iloc[:, i].values
                             for i in range(rnlvl - 1)]
                ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl)
                recons.index = ix
                recons = recons.iloc[:, rnlvl - 1:]

            type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O')
            if r_dtype:
                if r_dtype == 'u':  # unicode
                    r_dtype = 'O'
                    recons.index = np.array(lmap(_to_uni, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(lmap(_to_uni, df.index),
                                        dtype=r_dtype)
                elif r_dtype == 'dt':  # datetime
                    r_dtype = 'O'
                    recons.index = np.array(lmap(Timestamp, recons.index),
                                            dtype=r_dtype)
                    df.index = np.array(
                        lmap(Timestamp, df.index), dtype=r_dtype)
                elif r_dtype == 'p':
                    r_dtype = 'O'
                    recons.index = np.array(
                        list(map(Timestamp, to_datetime(recons.index))),
                        dtype=r_dtype)
                    df.index = np.array(
                        list(map(Timestamp, df.index.to_timestamp())),
                        dtype=r_dtype)
                else:
                    r_dtype = type_map.get(r_dtype)
                    recons.index = np.array(recons.index, dtype=r_dtype)
                    df.index = np.array(df.index, dtype=r_dtype)
            if c_dtype:
                if c_dtype == 'u':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(_to_uni, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(
                        lmap(_to_uni, df.columns), dtype=c_dtype)
                elif c_dtype == 'dt':
                    c_dtype = 'O'
                    recons.columns = np.array(lmap(Timestamp, recons.columns),
                                              dtype=c_dtype)
                    df.columns = np.array(
                        lmap(Timestamp, df.columns), dtype=c_dtype)
                elif c_dtype == 'p':
                    c_dtype = 'O'
                    recons.columns = np.array(
                        lmap(Timestamp, to_datetime(recons.columns)),
                        dtype=c_dtype)
                    df.columns = np.array(
                        lmap(Timestamp, df.columns.to_timestamp()),
                        dtype=c_dtype)
                else:
                    c_dtype = type_map.get(c_dtype)
                    recons.columns = np.array(recons.columns, dtype=c_dtype)
                    df.columns = np.array(df.columns, dtype=c_dtype)

            assert_frame_equal(df, recons, check_names=False,
                               check_less_precise=True)

        N = 100
        chunksize = 1000

        for ncols in [4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_type='dt',
                              c_idx_type='s'), 'dt', 's')

        for r_idx_type, c_idx_type in [('i', 'i'), ('s', 's'), ('u', 'dt'),
                                       ('p', 'p')]:
            for ncols in [1, 2, 3, 4]:
                base = int((chunksize // ncols or 1) or 1)
                for nrows in [2, 10, N - 1, N, N + 1, N + 2, 2 * N - 2,
                              2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                              base - 1, base, base + 1]:
                    _do_test(mkdf(nrows, ncols, r_idx_type=r_idx_type,
                                  c_idx_type=c_idx_type),
                             r_idx_type, c_idx_type)

        for ncols in [1, 2, 3, 4]:
            base = int((chunksize // ncols or 1) or 1)
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols))

        for nrows in [10, N - 2, N - 1, N, N + 1, N + 2]:
            df = mkdf(nrows, 3)
            cols = list(df.columns)
            cols[:2] = ["dupe", "dupe"]
            cols[-2:] = ["dupe", "dupe"]
            ix = list(df.index)
            ix[:2] = ["rdupe", "rdupe"]
            ix[-2:] = ["rdupe", "rdupe"]
            df.index = ix
            df.columns = cols
            _do_test(df, dupe_col=True)

        _do_test(DataFrame(index=lrange(10)))
        _do_test(mkdf(chunksize // 2 + 1, 2, r_idx_nlevels=2), rnlvl=2)
        for ncols in [2, 3, 4]:
            base = int(chunksize // ncols)
            for nrows in [10, N - 2, N - 1, N, N + 1, N + 2, 2 * N - 2,
                          2 * N - 1, 2 * N, 2 * N + 1, 2 * N + 2,
                          base - 1, base, base + 1]:
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2), rnlvl=2)
                _do_test(mkdf(nrows, ncols, c_idx_nlevels=2), cnlvl=2)
                _do_test(mkdf(nrows, ncols, r_idx_nlevels=2, c_idx_nlevels=2),
                         rnlvl=2, cnlvl=2)
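
    # inf and -inf are written as text and must parse back to floats.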
    def test_to_csv_from_csv_w_some_infs(self):

        # test roundtrip with inf, -inf, nan, as full columns and mix
        self.frame['G'] = np.nan
        f = lambda x: [np.inf, np.nan][np.random.rand() < .5]
        self.frame['H'] = self.frame.index.map(f)

        with ensure_clean() as path:
            self.frame.to_csv(path)
            recons = self.read_csv(path)

            # TODO to_csv drops column name
            assert_frame_equal(self.frame, recons, check_names=False)
            assert_frame_equal(np.isinf(self.frame),
                               np.isinf(recons), check_names=False)

    def test_to_csv_from_csv_w_all_infs(self):

        # test roundtrip with inf, -inf, nan, as full columns and mix
        self.frame['E'] = np.inf
        self.frame['F'] = -np.inf

        with ensure_clean() as path:
            self.frame.to_csv(path)
            recons = self.read_csv(path)

            # TODO to_csv drops column name
            assert_frame_equal(self.frame, recons, check_names=False)
            assert_frame_equal(np.isinf(self.frame),
                               np.isinf(recons), check_names=False)

    def test_to_csv_no_index(self):
        # GH 3624, after appending columns, to_csv fails
        with ensure_clean('__tmp_to_csv_no_index__') as path:
            df = DataFrame({'c1': [1, 2, 3], 'c2': [4, 5, 6]})
            df.to_csv(path, index=False)
            result = read_csv(path)
            assert_frame_equal(df, result)
            df['c3'] = Series([7, 8, 9], dtype='int64')
            df.to_csv(path, index=False)
            result = read_csv(path)
            assert_frame_equal(df, result)

    def test_to_csv_with_mix_columns(self):
        # gh-11637: incorrect output when a mix of integer and string column
        # names passed as columns parameter in to_csv

        df = DataFrame({0: ['a', 'b', 'c'],
                        1: ['aa', 'bb', 'cc']})
        df['test'] = 'txt'
        assert df.to_csv() == df.to_csv(columns=[0, 1, 'test'])

    def test_to_csv_headers(self):
        # GH6186, the presence or absence of `index` incorrectly
        # causes to_csv to have different header semantics.
        from_df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
        to_df = DataFrame([[1, 2], [3, 4]], columns=['X', 'Y'])
        with ensure_clean('__tmp_to_csv_headers__') as path:
            from_df.to_csv(path, header=['X', 'Y'])
            recons = self.read_csv(path)
            assert_frame_equal(to_df, recons)

            from_df.to_csv(path, index=False, header=['X', 'Y'])
            recons = self.read_csv(path)
            recons.reset_index(inplace=True)
            assert_frame_equal(to_df, recons)

    def test_to_csv_multiindex(self):

        frame = self.frame
        old_index = frame.index
        arrays = np.arange(len(old_index) * 2).reshape(2, -1)
        new_index = MultiIndex.from_arrays(arrays, names=['first', 'second'])
        frame.index = new_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:

            frame.to_csv(path, header=False)
            frame.to_csv(path, columns=['A', 'B'])

            # round trip
            frame.to_csv(path)

            df = self.read_csv(path, index_col=[0, 1],
                               parse_dates=False)

            # TODO to_csv drops column name
            assert_frame_equal(frame, df, check_names=False)
            assert frame.index.names == df.index.names

            # needed if setUp becomes a class method
            self.frame.index = old_index

            # try multiindex with dates
            tsframe = self.tsframe
            old_index = tsframe.index
            new_index = [old_index, np.arange(len(old_index))]
            tsframe.index = MultiIndex.from_arrays(new_index)

            tsframe.to_csv(path, index_label=['time', 'foo'])
            recons = self.read_csv(path, index_col=[0, 1])

            # TODO to_csv drops column name
            assert_frame_equal(tsframe, recons, check_names=False)

            # do not load index
            tsframe.to_csv(path)
            recons = self.read_csv(path, index_col=None)
            assert len(recons.columns) == len(tsframe.columns) + 2

            # no index
            tsframe.to_csv(path, index=False)
            recons = self.read_csv(path, index_col=None)
            assert_almost_equal(recons.values, self.tsframe.values)

            # needed if setUp becomes class method
            self.tsframe.index = old_index

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # GH3571, GH1651, GH3141

            def _make_frame(names=None):
                if names is True:
                    names = ['first', 'second']
                return DataFrame(np.random.randint(0, 10, size=(3, 3)),
                                 columns=MultiIndex.from_tuples(
                                     [('bah', 'foo'),
                                      ('bah', 'bar'),
                                      ('ban', 'baz')], names=names),
                                 dtype='int64')

            # column & index are multi-index
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3],
                              index_col=[0, 1])
            assert_frame_equal(df, result)

            # column is mi
            df = mkdf(5, 3, r_idx_nlevels=1, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(
                path, header=[0, 1, 2, 3], index_col=0)
            assert_frame_equal(df, result)

            # dup column names?
            df = mkdf(5, 3, r_idx_nlevels=3, c_idx_nlevels=4)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1, 2, 3],
                              index_col=[0, 1, 2])
            assert_frame_equal(df, result)

            # writing with no index
            df = _make_frame()
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert_frame_equal(df, result)

            # we lose the names here
            df = _make_frame(True)
            df.to_csv(path, index=False)
            result = read_csv(path, header=[0, 1])
            assert com._all_none(*result.columns.names)
            result.columns.names = df.columns.names
            assert_frame_equal(df, result)

            # tupleize_cols=True and index=False
            df = _make_frame(True)
            with tm.assert_produces_warning(FutureWarning):
                df.to_csv(path, tupleize_cols=True, index=False)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path, header=0,
                                  tupleize_cols=True,
                                  index_col=None)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # whatsnew example
            df = _make_frame()
            df.to_csv(path)
            result = read_csv(path, header=[0, 1],
                              index_col=[0])
            assert_frame_equal(df, result)

            df = _make_frame(True)
            df.to_csv(path)
            result = read_csv(path, header=[0, 1],
                              index_col=[0])
            assert_frame_equal(df, result)

            # column & index are multi-index (compatibility)
            df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
            with tm.assert_produces_warning(FutureWarning):
                df.to_csv(path, tupleize_cols=True)

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = read_csv(path, header=0, index_col=[0, 1],
                                  tupleize_cols=True)
            result.columns = df.columns
            assert_frame_equal(df, result)

            # invalid options
            df = _make_frame(True)
            df.to_csv(path)

            for i in [6, 7]:
                msg = 'len of {i}, but only 5 lines in file'.format(i=i)
                with tm.assert_raises_regex(ParserError, msg):
                    read_csv(path, header=lrange(i), index_col=0)

            # write with cols
            with tm.assert_raises_regex(TypeError, 'cannot specify cols '
                                        'with a MultiIndex'):
                df.to_csv(path, columns=['foo', 'bar'])

        with ensure_clean('__tmp_to_csv_multiindex__') as path:
            # empty
            tsframe[:0].to_csv(path)
            recons = self.read_csv(path)

            exp = tsframe[:0]
            exp.index = []

            tm.assert_index_equal(recons.columns, exp.columns)
            assert len(recons) == 0
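
    # na_rep replaces NaN cells with a literal value in the written file.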
    def test_to_csv_float32_nanrep(self):
        df = DataFrame(np.random.randn(1, 4).astype(np.float32))
        df[1] = np.nan

        with ensure_clean('__tmp_to_csv_float32_nanrep__.csv') as path:
            df.to_csv(path, na_rep=999)

            with open(path) as f:
                lines = f.readlines()
                assert lines[1].split(',')[2] == '999'

    def test_to_csv_withcommas(self):
        # Commas inside fields should be correctly escaped when saving as CSV.
        df = DataFrame({'A': [1, 2, 3], 'B': ['5,6', '7,8', '9,0']})

        with ensure_clean('__tmp_to_csv_withcommas__.csv') as path:
            df.to_csv(path)
            df2 = self.read_csv(path)
            assert_frame_equal(df2, df)

    def test_to_csv_mixed(self):

        def create_cols(name):
            return ["%s%03d" % (name, i) for i in range(5)]

        df_float = DataFrame(np.random.randn(100, 5), dtype='float64',
                             columns=create_cols('float'))
        df_int = DataFrame(np.random.randn(100, 5),
                           dtype='int64', columns=create_cols('int'))
        df_bool = DataFrame(True, index=df_float.index,
                            columns=create_cols('bool'))
        df_object = DataFrame('foo', index=df_float.index,
                              columns=create_cols('object'))
        df_dt = DataFrame(Timestamp('20010101'),
                          index=df_float.index, columns=create_cols('date'))

        # add in some nans
        df_float.loc[30:50, 1:3] = np.nan

        # ## this is a bug in read_csv right now ####
        # df_dt.loc[30:50,1:3] = np.nan

        df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

        # dtype
        dtypes = dict()
        for n, dtype in [('float', np.float64), ('int', np.int64),
                         ('bool', np.bool), ('object', np.object)]:
            for c in create_cols(n):
                dtypes[c] = dtype

        with ensure_clean() as filename:
            df.to_csv(filename)
            rs = read_csv(filename, index_col=0, dtype=dtypes,
                          parse_dates=create_cols('date'))
            assert_frame_equal(rs, df)

    def test_to_csv_dups_cols(self):

        df = DataFrame(np.random.randn(1000, 30),
                       columns=lrange(15) + lrange(15),
                       dtype='float64')

        with ensure_clean() as filename:
            df.to_csv(filename)  # single dtype, fine
            result = read_csv(filename, index_col=0)
            result.columns = df.columns
            assert_frame_equal(result, df)

        df_float = DataFrame(np.random.randn(1000, 3), dtype='float64')
        df_int = DataFrame(np.random.randn(1000, 3), dtype='int64')
        df_bool = DataFrame(True, index=df_float.index, columns=lrange(3))
        df_object = DataFrame('foo', index=df_float.index, columns=lrange(3))
        df_dt = DataFrame(Timestamp('20010101'),
                          index=df_float.index, columns=lrange(3))
        df = pd.concat([df_float, df_int, df_bool, df_object,
                        df_dt], axis=1, ignore_index=True)

        cols = []
        for i in range(5):
            cols.extend([0, 1, 2])
        df.columns = cols

        with ensure_clean() as filename:
            df.to_csv(filename)
            result = read_csv(filename, index_col=0)

            # date cols
            for i in ['0.4', '1.4', '2.4']:
                result[i] = to_datetime(result[i])

            result.columns = df.columns
            assert_frame_equal(result, df)

        # GH3457
        from pandas.util.testing import makeCustomDataframe as mkdf

        N = 10
        df = mkdf(N, 3)
        df.columns = ['a', 'a', 'b']

        with ensure_clean() as filename:
            df.to_csv(filename)

            # read_csv will rename the dups columns
            result = read_csv(filename, index_col=0)
            result = result.rename(columns={'a.1': 'a'})
            assert_frame_equal(result, df)

    def test_to_csv_chunking(self):

        aa = DataFrame({'A': lrange(100000)})
        aa['B'] = aa.A + 1.0
        aa['C'] = aa.A + 2.0
        aa['D'] = aa.A + 3.0

        for chunksize in [10000, 50000, 100000]:
            with ensure_clean() as filename:
                aa.to_csv(filename, chunksize=chunksize)
                rs = read_csv(filename, index_col=0)
                assert_frame_equal(rs, aa)

    @pytest.mark.slow
    def test_to_csv_wide_frame_formatting(self):
        # Issue #8621
        df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
        with ensure_clean() as filename:
            df.to_csv(filename, header=False, index=False)
            rs = read_csv(filename, header=None)
            assert_frame_equal(rs, df)

    def test_to_csv_bug(self):
        f1 = StringIO('a,1.0\nb,2.0')
        df = self.read_csv(f1, header=None)
        newdf = DataFrame({'t': df[df.columns[0]]})

        with ensure_clean() as path:
            newdf.to_csv(path)

            recons = read_csv(path, index_col=0)
            # don't check_names as t != 1
            assert_frame_equal(recons, newdf, check_names=False)

    def test_to_csv_unicode(self):

        df = DataFrame({u('c/\u03c3'): [1, 2, 3]})
        with ensure_clean() as path:

            df.to_csv(path, encoding='UTF-8')
            df2 = read_csv(path, index_col=0, encoding='UTF-8')
            assert_frame_equal(df, df2)

            df.to_csv(path, encoding='UTF-8', index=False)
            df2 = read_csv(path, index_col=None, encoding='UTF-8')
            assert_frame_equal(df, df2)

    def test_to_csv_unicode_index_col(self):
        buf = StringIO('')
        df = DataFrame(
            [[u("\u05d0"), "d2", "d3", "d4"], ["a1", "a2", "a3", "a4"]],
            columns=[u("\u05d0"),
                     u("\u05d1"), u("\u05d2"), u("\u05d3")],
            index=[u("\u05d0"), u("\u05d1")])

        df.to_csv(buf, encoding='UTF-8')
        buf.seek(0)

        df2 = read_csv(buf, index_col=0, encoding='UTF-8')
        assert_frame_equal(df, df2)

    def test_to_csv_stringio(self):
        buf = StringIO()
        self.frame.to_csv(buf)
        buf.seek(0)
        recons = read_csv(buf, index_col=0)
        # TODO to_csv drops column name
        assert_frame_equal(recons, self.frame, check_names=False)

    def test_to_csv_float_format(self):

        df = DataFrame([[0.123456, 0.234567, 0.567567],
                        [12.32112, 123123.2, 321321.2]],
                       index=['A', 'B'], columns=['X', 'Y', 'Z'])

        with ensure_clean() as filename:

            df.to_csv(filename, float_format='%.2f')

            rs = read_csv(filename, index_col=0)
            xp = DataFrame([[0.12, 0.23, 0.57],
                            [12.32, 123123.20, 321321.20]],
                           index=['A', 'B'], columns=['X', 'Y', 'Z'])
            assert_frame_equal(rs, xp)

    def test_to_csv_unicodewriter_quoting(self):
        df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']})

        buf = StringIO()
        df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC,
                  encoding='utf-8')

        result = buf.getvalue()
        expected = ('"A","B"\n'
                    '1,"foo"\n'
                    '2,"bar"\n'
                    '3,"baz"\n')

        assert result == expected

    def test_to_csv_quote_none(self):
        # GH4328
        df = DataFrame({'A': ['hello', '{"hello"}']})
        for encoding in (None, 'utf-8'):
            buf = StringIO()
            df.to_csv(buf, quoting=csv.QUOTE_NONE,
                      encoding=encoding, index=False)
            result = buf.getvalue()
            expected = 'A\nhello\n{"hello"}\n'
            assert result == expected

    def test_to_csv_index_no_leading_comma(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, index_label=False)
        expected = ('A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        assert buf.getvalue() == expected

    def test_to_csv_line_terminators(self):
        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                       index=['one', 'two', 'three'])

        buf = StringIO()
        df.to_csv(buf, line_terminator='\r\n')
        expected = (',A,B\r\n'
                    'one,1,4\r\n'
                    'two,2,5\r\n'
                    'three,3,6\r\n')
        assert buf.getvalue() == expected

        buf = StringIO()
        df.to_csv(buf)  # The default line terminator remains \n
        expected = (',A,B\n'
                    'one,1,4\n'
                    'two,2,5\n'
                    'three,3,6\n')
        assert buf.getvalue() == expected
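
    # A categorical column should serialize exactly like the equivalent
    # object-dtype column.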
    def test_to_csv_from_csv_categorical(self):
        # CSV with categoricals should result in the same output as when one
        # would add a "normal" Series/DataFrame.
        s = Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        res = StringIO()
        s.to_csv(res)
        exp = StringIO()
        s2.to_csv(exp)
        assert res.getvalue() == exp.getvalue()

        df = DataFrame({"s": s})
        df2 = DataFrame({"s": s2})
        res = StringIO()
        df.to_csv(res)
        exp = StringIO()
        df2.to_csv(exp)
        assert res.getvalue() == exp.getvalue()

    def test_to_csv_path_is_none(self):
        # GH 8215
        # Make sure we return string for consistency with
        # Series.to_csv()
        csv_str = self.frame.to_csv(path_or_buf=None)
        assert isinstance(csv_str, str)
        recons = pd.read_csv(StringIO(csv_str), index_col=0)
        assert_frame_equal(self.frame, recons)
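
    # The `compression` argument below is supplied by a pytest fixture
    # (presumably defined in the suite's conftest), so each parametrized
    # case runs once per supported compression type.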
    @pytest.mark.parametrize('df,encoding', [
        (DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
        # GH 21241, 21118
        (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
        (DataFrame(5 * [[123, u"你好", u"世界"]],
                   columns=['X', 'Y', 'Z']), 'gb2312'),
        (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
                   columns=['X', 'Y', 'Z']), 'cp737')
    ])
    def test_to_csv_compression(self, df, encoding, compression):

        with ensure_clean() as filename:

            df.to_csv(filename, compression=compression, encoding=encoding)

            # test the round trip - to_csv -> read_csv
            result = read_csv(filename, compression=compression,
                              index_col=0, encoding=encoding)
            assert_frame_equal(df, result)

            # test the round trip using file handle - to_csv -> read_csv
            f, _handles = _get_handle(filename, 'w', compression=compression,
                                      encoding=encoding)
            with f:
                df.to_csv(f, encoding=encoding)
            result = pd.read_csv(filename, compression=compression,
                                 encoding=encoding, index_col=0, squeeze=True)
            assert_frame_equal(df, result)

            # explicitly make sure file is compressed
            with tm.decompress_file(filename, compression) as fh:
                text = fh.read().decode(encoding or 'utf8')
                for col in df.columns:
                    assert col in text

            with tm.decompress_file(filename, compression) as fh:
                assert_frame_equal(df, read_csv(fh,
                                                index_col=0,
                                                encoding=encoding))

    def test_to_csv_date_format(self):
        with ensure_clean('__tmp_to_csv_date_format__') as path:
            dt_index = self.tsframe.index
            datetime_frame = DataFrame(
                {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index)
            datetime_frame.to_csv(path, date_format='%Y%m%d')

            # Check that the data was put in the specified format
            test = read_csv(path, index_col=0)

            datetime_frame_int = datetime_frame.applymap(
                lambda x: int(x.strftime('%Y%m%d')))
            datetime_frame_int.index = datetime_frame_int.index.map(
                lambda x: int(x.strftime('%Y%m%d')))

            assert_frame_equal(test, datetime_frame_int)

            datetime_frame.to_csv(path, date_format='%Y-%m-%d')

            # Check that the data was put in the specified format
            test = read_csv(path, index_col=0)
            datetime_frame_str = datetime_frame.applymap(
                lambda x: x.strftime('%Y-%m-%d'))
            datetime_frame_str.index = datetime_frame_str.index.map(
                lambda x: x.strftime('%Y-%m-%d'))

            assert_frame_equal(test, datetime_frame_str)

            # Check that columns get converted
            datetime_frame_columns = datetime_frame.T
            datetime_frame_columns.to_csv(path, date_format='%Y%m%d')

            test = read_csv(path, index_col=0)

            datetime_frame_columns = datetime_frame_columns.applymap(
                lambda x: int(x.strftime('%Y%m%d')))
            # Columns don't get converted to ints by read_csv
            datetime_frame_columns.columns = (
                datetime_frame_columns.columns
                .map(lambda x: x.strftime('%Y%m%d')))

            assert_frame_equal(test, datetime_frame_columns)

            # test NaTs
            nat_index = to_datetime(
                ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
            nat_frame = DataFrame({'A': nat_index}, index=nat_index)
            nat_frame.to_csv(path, date_format='%Y-%m-%d')

            test = read_csv(path, parse_dates=[0, 1], index_col=0)

            assert_frame_equal(test, nat_frame)

    def test_to_csv_with_dst_transitions(self):

        with ensure_clean('csv_date_format_with_dst') as path:
            # make sure we are not failing on transitions
            times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
                                  tz="Europe/London",
                                  freq="H",
                                  ambiguous='infer')

            for i in [times, times + pd.Timedelta('10s')]:
                time_range = np.array(range(len(i)), dtype='int64')
                df = DataFrame({'A': time_range}, index=i)
                df.to_csv(path, index=True)

                # we have to reconvert the index as we
                # don't parse the tz's
                result = read_csv(path, index_col=0)
                result.index = to_datetime(result.index).tz_localize(
                    'UTC').tz_convert('Europe/London')
                assert_frame_equal(result, df)

        # GH11619
        idx = pd.date_range('2015-01-01', '2015-12-31',
                            freq='H', tz='Europe/Paris')
        df = DataFrame({'values': 1, 'idx': idx},
                       index=idx)
        with ensure_clean('csv_date_format_with_dst') as path:
            df.to_csv(path, index=True)
            result = read_csv(path, index_col=0)
            result.index = to_datetime(result.index).tz_localize(
                'UTC').tz_convert('Europe/Paris')
            result['idx'] = to_datetime(result['idx']).astype(
                'datetime64[ns, Europe/Paris]')
            assert_frame_equal(result, df)

        # assert working
        df.astype(str)

        with ensure_clean('csv_date_format_with_dst') as path:
            df.to_pickle(path)
            result = pd.read_pickle(path)
            assert_frame_equal(result, df)
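
    # The expected strings below map directly onto the csv module's quoting
    # constants: QUOTE_MINIMAL (the default) quotes only fields containing
    # special characters such as the delimiter, QUOTE_ALL quotes everything,
    # QUOTE_NONNUMERIC quotes non-numeric fields, and QUOTE_NONE requires an
    # escapechar for embedded delimiters.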
    def test_to_csv_quoting(self):
        df = DataFrame({
            'c_bool': [True, False],
            'c_float': [1.0, 3.2],
            'c_int': [42, np.nan],
            'c_string': ['a', 'b,c'],
        })

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,"b,c"
"""
        result = df.to_csv()
        assert result == expected

        result = df.to_csv(quoting=None)
        assert result == expected

        result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
        assert result == expected

        expected = """\
"","c_bool","c_float","c_int","c_string"
"0","True","1.0","42.0","a"
"1","False","3.2","","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_ALL)
        assert result == expected

        # see gh-12922, gh-13259: make sure changes to
        # the formatters do not break this behaviour
        expected = """\
"","c_bool","c_float","c_int","c_string"
0,True,1.0,42.0,"a"
1,False,3.2,"","b,c"
"""
        result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
        assert result == expected

        msg = "need to escape, but no escapechar set"
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE)
        tm.assert_raises_regex(csv.Error, msg, df.to_csv,
                               quoting=csv.QUOTE_NONE,
                               escapechar=None)

        expected = """\
,c_bool,c_float,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,b!,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='!')
        assert result == expected

        expected = """\
,c_bool,c_ffloat,c_int,c_string
0,True,1.0,42.0,a
1,False,3.2,,bf,c
"""
        result = df.to_csv(quoting=csv.QUOTE_NONE,
                           escapechar='f')
        assert result == expected

        # see gh-3503: quoting Windows line terminators
        # presents with encoding?
        text = 'a,b,c\n1,"test \r\n",3\n'
        df = pd.read_csv(StringIO(text))
        buf = StringIO()
        df.to_csv(buf, encoding='utf-8', index=False)
        assert buf.getvalue() == text

        # xref gh-7791: make sure the quoting parameter is passed through
        # with multi-indexes
        df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
        df = df.set_index(['a', 'b'])
        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
        assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
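
    # 3005-01-01 is past Timestamp.max (which falls in the year 2262), so
    # formatting must go through Period rather than Timestamp (gh-15982).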
    def test_period_index_date_overflow(self):
        # see gh-15982
        dates = ["1990-01-01", "2000-01-01", "3005-01-01"]
        index = pd.PeriodIndex(dates, freq="D")

        df = pd.DataFrame([4, 5, 6], index=index)
        result = df.to_csv()

        expected = ',0\n1990-01-01,4\n2000-01-01,5\n3005-01-01,6\n'
        assert result == expected

        date_format = "%m-%d-%Y"
        result = df.to_csv(date_format=date_format)

        expected = ',0\n01-01-1990,4\n01-01-2000,5\n01-01-3005,6\n'
        assert result == expected

        # Overflow with pd.NaT
        dates = ["1990-01-01", pd.NaT, "3005-01-01"]
        index = pd.PeriodIndex(dates, freq="D")

        df = pd.DataFrame([4, 5, 6], index=index)
        result = df.to_csv()

        expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n'
        assert result == expected
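
    # An explicit `header` list overrides the MultiIndex column labels and
    # writes a single flat header row.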
    def test_multi_index_header(self):
        # see gh-5539
        columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2),
                                             ("b", 1), ("b", 2)])
        df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
        df.columns = columns

        header = ["a", "b", "c", "d"]
        result = df.to_csv(header=header)

        expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n"
        assert result == expected