89 lines
3.5 KiB
Python
89 lines
3.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
Tests that duplicate columns are handled appropriately when parsed by the
|
||
|
CSV engine. In general, the expected result is that they are either thoroughly
|
||
|
de-duplicated (if mangling requested) or ignored otherwise.
|
||
|
"""
|
||
|
|
||
|
from pandas.compat import StringIO
|
||
|
from pandas import DataFrame
|
||
|
|
||
|
import pandas.util.testing as tm
|
||
|
|
||
|
|
||
|
class DupeColumnTests(object):
|
||
|
def test_basic(self):
|
||
|
# TODO: add test for condition "mangle_dupe_cols=False"
|
||
|
# once it is actually supported (gh-12935)
|
||
|
data = "a,a,b,b,b\n1,2,3,4,5"
|
||
|
|
||
|
for method in ("read_csv", "read_table"):
|
||
|
# Check default behavior.
|
||
|
expected = ["a", "a.1", "b", "b.1", "b.2"]
|
||
|
df = getattr(self, method)(StringIO(data), sep=",")
|
||
|
assert list(df.columns) == expected
|
||
|
|
||
|
df = getattr(self, method)(StringIO(data), sep=",",
|
||
|
mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == expected
|
||
|
|
||
|
def test_basic_names(self):
|
||
|
# See gh-7160
|
||
|
data = "a,b,a\n0,1,2\n3,4,5"
|
||
|
expected = DataFrame([[0, 1, 2], [3, 4, 5]],
|
||
|
columns=["a", "b", "a.1"])
|
||
|
|
||
|
df = self.read_csv(StringIO(data))
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||
|
data = "0,1,2\n3,4,5"
|
||
|
df = self.read_csv(StringIO(data),
|
||
|
names=["a", "b", "a"])
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_thorough_mangle_columns(self):
|
||
|
# see gh-17060
|
||
|
data = "a,a,a.1\n1,2,3"
|
||
|
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a", "a.1", "a.1.1"]
|
||
|
|
||
|
data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6"
|
||
|
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
|
||
|
"a.1.1.1.1", "a.1.1.1.1.1"]
|
||
|
|
||
|
data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7"
|
||
|
df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
|
||
|
"a.2", "a.2.1", "a.3.1"]
|
||
|
|
||
|
def test_thorough_mangle_names(self):
|
||
|
# see gh-17095
|
||
|
data = "a,b,b\n1,2,3"
|
||
|
names = ["a.1", "a.1", "a.1.1"]
|
||
|
|
||
|
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||
|
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||
|
mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"]
|
||
|
|
||
|
data = "a,b,c,d,e,f\n1,2,3,4,5,6"
|
||
|
names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"]
|
||
|
|
||
|
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||
|
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||
|
mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1",
|
||
|
"a.1.1.1.1", "a.1.1.1.1.1"]
|
||
|
|
||
|
data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7"
|
||
|
names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"]
|
||
|
|
||
|
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
|
||
|
df = self.read_csv(StringIO(data), sep=",", names=names,
|
||
|
mangle_dupe_cols=True)
|
||
|
assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1",
|
||
|
"a.2", "a.2.1", "a.3.1"]
|