# -*- coding: utf-8 -*-

"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
|
|
|
|
import pytest
|
|
|
|
import numpy as np
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas import DataFrame, Index
|
|
from pandas._libs.tslib import Timestamp
|
|
from pandas.compat import StringIO
|
|
|
|
|
|
class UsecolsTests(object):
    """
    Shared ``usecols`` test cases for the CSV parsers.

    Every test reads its input through ``self.read_csv``. This class does
    not define ``read_csv`` itself, so the concrete test class that mixes
    these tests in has to supply it.
    """

    # Both messages are passed to ``tm.assert_raises_regex`` below, i.e.
    # they are used as regular-expression patterns, not compared verbatim.
    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
                                "strings, all unicode, all integers or a "
                                "callable.")
    msg_validate_usecols_names = ("Usecols do not match columns, columns "
                                  "expected but not found: {0}")

    def test_raise_on_mixed_dtype_usecols(self):
        """A ``usecols`` list mixing integers and strings must raise."""
        # See gh-12678
        data = """a,b,c
1000,2000,3000
4000,5000,6000
"""

        usecols = [0, 'b', 2]

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)

    def test_usecols(self):
        """Positional and label ``usecols`` select the same columns."""
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        result = self.read_csv(StringIO(data), usecols=(1, 2))
        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
        exp = self.read_csv(StringIO(data))

        assert len(result.columns) == 2
        assert (result['b'] == exp['b']).all()
        assert (result['c'] == exp['c']).all()

        tm.assert_frame_equal(result, result2)

        # explicit names replace the header labels of the selected columns
        result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
                               names=['foo', 'bar'])
        expected = self.read_csv(StringIO(data), usecols=[1, 2])
        expected.columns = ['foo', 'bar']
        tm.assert_frame_equal(result, expected)

        # same data without a header row
        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
        result = self.read_csv(StringIO(data), names=['b', 'c'],
                               header=None, usecols=[1, 2])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['b', 'c']]
        tm.assert_frame_equal(result, expected)

        result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                header=None, usecols=['b', 'c'])
        tm.assert_frame_equal(result2, result)

        # see gh-5766
        result = self.read_csv(StringIO(data), names=['a', 'b'],
                               header=None, usecols=[0, 1])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['a', 'b']]
        tm.assert_frame_equal(result, expected)

        # length conflict, passed names and usecols disagree
        with pytest.raises(ValueError):
            self.read_csv(StringIO(data), names=['a', 'b'],
                          usecols=[1], header=None)

    def test_usecols_single_string(self):
        """A bare string (not a list-like) as ``usecols`` must raise."""
        # GH 20558
        data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000
"""

        usecols = 'foo'

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)

    def test_usecols_index_col_False(self):
        """``index_col=False`` with ``usecols``, incl. trailing delimiters."""
        # see gh-9082
        s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
        # every data row carries one trailing delimiter
        s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8,"
        cols = ['a', 'c', 'd']
        expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]})
        df = self.read_csv(StringIO(s), usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)
        df = self.read_csv(StringIO(s_malformed),
                           usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)

    def test_usecols_index_col_conflict(self):
        """``index_col`` is resolved against the columns kept by usecols."""
        # see gh-4201: test that index_col as integer reflects usecols
        data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
        expected = DataFrame({'c': [1, 2]}, index=Index(
            ['a', 'b'], name='b'))

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        # two index columns
        expected = DataFrame(
            {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
        expected = expected.set_index(['b', 'c'])
        df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
                           index_col=['b', 'c'])
        tm.assert_frame_equal(expected, df)

    def test_usecols_implicit_index_col(self):
        """``usecols`` when the data rows have one more field than header."""
        # see gh-2654
        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'

        result = self.read_csv(StringIO(data), usecols=['a', 'b'])
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)

    def test_usecols_regex_sep(self):
        """``usecols`` combined with a regular-expression separator."""
        # see gh-2733
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b'))

        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_whitespace(self):
        """``usecols`` combined with ``delim_whitespace=True``."""
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        result = self.read_csv(StringIO(data), delim_whitespace=True,
                               usecols=('a', 'b'))
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)

    def test_usecols_with_integer_like_header(self):
        """Integers select by position, numeric strings select by label."""
        data = """2,0,1
1000,2000,3000
4000,5000,6000
"""

        usecols = [0, 1]  # column selection by index
        expected = DataFrame(data=[[1000, 2000],
                                   [4000, 5000]],
                             columns=['2', '0'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

        usecols = ['0', '1']  # column selection by name
        expected = DataFrame(data=[[2000, 3000],
                                   [5000, 6000]],
                             columns=['0', '1'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates(self):
        """``usecols`` together with ``parse_dates`` in several layouts."""
        # See gh-9755
        s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # the order of the usecols entries must not matter
        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # See gh-13604
        s = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65
"""
        parse_dates = [0]
        names = ['date', 'values']
        usecols = names[:]

        index = Index([Timestamp('2008-02-07 09:40'),
                       Timestamp('2008-02-07 09:50'),
                       Timestamp('2008-02-07 10:00')],
                      name='date')
        cols = {'values': [1032.43, 1042.54, 1051.65]}
        expected = DataFrame(cols, index=index)

        df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
                           usecols=usecols, header=None, names=names)
        tm.assert_frame_equal(df, expected)

        # See gh-14792
        s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [0]
        usecols = list('abcdefghij')
        cols = {'a': Timestamp('2016-09-21'),
                'b': [1], 'c': [1], 'd': [2],
                'e': [3], 'f': [4], 'g': [5],
                'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=usecols)
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # columns 'a' and 'b' are combined; the expected 'a_b' value is
        # the joined text, not a Timestamp
        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [[0, 1]]
        usecols = list('abcdefghij')
        cols = {'a_b': '2016/09/21 1',
                'c': [1], 'd': [2], 'e': [3], 'f': [4],
                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates_and_full_names(self):
        """``parse_dates`` + ``usecols`` with ``names`` for every column."""
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('abcde')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates_and_usecol_names(self):
        """``parse_dates`` + ``usecols`` with ``names`` for used columns."""
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('acd')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_unicode_strings(self):
        """``usecols`` given as unicode string literals."""
        # see gh-13219

        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AAA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'BBB': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_single_byte_unicode_strings(self):
        """``usecols`` given as one-character unicode string literals."""
        # see gh-13219

        s = '''A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'A': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'B': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_mixed_encoding_strings(self):
        """Mixing unicode and bytes entries in ``usecols`` must raise."""
        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])

    def test_usecols_with_multibyte_characters(self):
        """``usecols`` labels made of multibyte characters."""
        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_multibyte_unicode_characters(self):
        """Multibyte labels passed as ``u''`` literals (currently skipped)."""
        # The skip is unconditional; the body below is kept so the test can
        # be re-enabled once the referenced issue is resolved.
        pytest.skip('TODO: see gh-13253')

        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
        tm.assert_frame_equal(df, expected)

    def test_empty_usecols(self):
        """An empty ``usecols`` yields an empty frame instead of raising."""
        # should not raise
        data = 'a,b,c\n1,2,3\n4,5,6'
        expected = DataFrame()
        result = self.read_csv(StringIO(data), usecols=set())
        tm.assert_frame_equal(result, expected)

    def test_np_array_usecols(self):
        """``usecols`` may be a NumPy array of labels."""
        # See gh-12546
        data = 'a,b,c\n1,2,3'
        usecols = np.array(['a', 'b'])

        expected = DataFrame([[1, 2]], columns=usecols)
        result = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)

    def test_callable_usecols(self):
        """``usecols`` may be a callable evaluated against column names."""
        # See gh-14154
        s = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AaA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'bBb': {0: 8, 1: 2, 2: 7},
            'ddd': {0: 'a', 1: 'b', 2: 'a'}
        }
        expected = DataFrame(data)
        df = self.read_csv(StringIO(s), usecols=lambda x:
                           x.upper() in ['AAA', 'BBB', 'DDD'])
        tm.assert_frame_equal(df, expected)

        # Check that a callable returning only False returns
        # an empty DataFrame
        expected = DataFrame()
        df = self.read_csv(StringIO(s), usecols=lambda x: False)
        tm.assert_frame_equal(df, expected)

    def test_incomplete_first_row(self):
        """A short first row is padded with NaN for the selected columns."""
        # see gh-6710
        data = '1,2\n1,2,3'
        names = ['a', 'b', 'c']
        expected = DataFrame({'a': [1, 1],
                              'c': [np.nan, 3]})

        usecols = ['a', 'c']
        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        # same selection expressed as a callable
        def usecols(x):
            return x in ['a', 'c']

        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_uneven_length_cols(self):
        """Rows with more fields than the selected columns are accepted."""
        # see gh-8985
        usecols = [0, 1, 2]
        data = '19,29,39\n' * 2 + '10,20,30,40'
        expected = DataFrame([[19, 29, 39],
                              [19, 29, 39],
                              [10, 20, 30]])
        df = self.read_csv(StringIO(data), header=None, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        # see gh-9549
        usecols = ['A', 'B', 'C']
        data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n'
                '1,2,3,,,1,\n1,2,3\n5,6,7')
        expected = DataFrame({'A': [1, 3, 1, 1, 1, 5],
                              'B': [2, 4, 2, 2, 2, 6],
                              'C': [3, 5, 4, 3, 3, 7]})
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_raise_on_usecols_names_mismatch(self):
        """``usecols`` labels missing from the columns must raise."""
        # GH 14671
        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

        usecols = ['a', 'b', 'c', 'd']
        df = self.read_csv(StringIO(data), usecols=usecols)
        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                              'd': [4, 8]})
        tm.assert_frame_equal(df, expected)

        usecols = ['a', 'b', 'c', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        usecols = ['a', 'b', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        # the pattern accepts the two missing names in either order
        usecols = ['a', 'b', 'f', 'g']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\[('f', 'g'|'g', 'f')\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        names = ['A', 'B', 'C', 'D']

        df = self.read_csv(StringIO(data), header=0, names=names)
        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
                              'D': [4, 8]})
        tm.assert_frame_equal(df, expected)

        # TODO: https://github.com/pandas-dev/pandas/issues/16469
        # usecols = ['A','C']
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)
        #
        # usecols = [0,2]
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)

        usecols = ['A', 'B', 'C', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), header=0, names=names,
                          usecols=usecols)

        # NOTE(review): unlike the call above, this one does not pass
        # header=0 -- confirm that the omission is intentional.
        usecols = ['A', 'B', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), names=names, usecols=usecols)