laywerrobot/lib/python3.6/site-packages/pandas/tests/io/parser/usecols.py

550 lines
19 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
"""
Tests the usecols functionality during parsing
for all of the parsers defined in parsers.py
"""
import pytest
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Index
from pandas._libs.tslib import Timestamp
from pandas.compat import StringIO
class UsecolsTests(object):
    """Parser-agnostic tests for the ``usecols`` argument of ``read_csv``.

    This class does not define ``read_csv``; every test calls
    ``self.read_csv``, so it has to be supplied by whatever class these
    tests are combined with.
    """

    # Expected error text when ``usecols`` has an unsupported type or mixes
    # element types.
    msg_validate_usecols_arg = ("'usecols' must either be list-like of all "
                                "strings, all unicode, all integers or a "
                                "callable.")
    # Expected error text when ``usecols`` names columns that are absent;
    # ``{0}`` is filled with a regex matching the missing names.
    msg_validate_usecols_names = ("Usecols do not match columns, columns "
                                  "expected but not found: {0}")

    def test_raise_on_mixed_dtype_usecols(self):
        """A ``usecols`` list mixing integers and strings must raise."""
        # See gh-12678
        data = """a,b,c
1000,2000,3000
4000,5000,6000
"""
        usecols = [0, 'b', 2]

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)

    def test_usecols(self):
        """Select columns by position and by name, with and without header.

        Also checks that ``names`` may describe either only the selected
        columns or all columns, and that a length conflict between
        ``names`` and ``usecols`` raises ``ValueError``.
        """
        data = """\
a,b,c
1,2,3
4,5,6
7,8,9
10,11,12"""

        # Positional and label-based selection must agree.
        result = self.read_csv(StringIO(data), usecols=(1, 2))
        result2 = self.read_csv(StringIO(data), usecols=('b', 'c'))
        exp = self.read_csv(StringIO(data))

        assert len(result.columns) == 2
        assert (result['b'] == exp['b']).all()
        assert (result['c'] == exp['c']).all()

        tm.assert_frame_equal(result, result2)

        # ``names`` replaces the header labels of the selected columns.
        result = self.read_csv(StringIO(data), usecols=[1, 2], header=0,
                               names=['foo', 'bar'])
        expected = self.read_csv(StringIO(data), usecols=[1, 2])
        expected.columns = ['foo', 'bar']
        tm.assert_frame_equal(result, expected)

        # Same checks on data without a header row.
        data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""
        result = self.read_csv(StringIO(data), names=['b', 'c'],
                               header=None, usecols=[1, 2])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['b', 'c']]
        tm.assert_frame_equal(result, expected)

        result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                header=None, usecols=['b', 'c'])
        tm.assert_frame_equal(result2, result)

        # see gh-5766
        result = self.read_csv(StringIO(data), names=['a', 'b'],
                               header=None, usecols=[0, 1])

        expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'],
                                 header=None)
        expected = expected[['a', 'b']]
        tm.assert_frame_equal(result, expected)

        # length conflict, passed names and usecols disagree
        with pytest.raises(ValueError):
            self.read_csv(StringIO(data), names=['a', 'b'],
                          usecols=[1], header=None)

    def test_usecols_single_string(self):
        """A bare string passed as ``usecols`` must raise ``ValueError``."""
        # GH 20558
        data = """foo, bar, baz
1000, 2000, 3000
4000, 5000, 6000
"""

        usecols = 'foo'

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(data), usecols=usecols)

    def test_usecols_index_col_False(self):
        """``usecols`` works with ``index_col=False``.

        The second input ends every data row with a trailing delimiter and
        is expected to give the same frame as the well-formed input.
        """
        # see gh-9082
        s = "a,b,c,d\n1,2,3,4\n5,6,7,8"
        s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8,"
        cols = ['a', 'c', 'd']
        expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]})

        df = self.read_csv(StringIO(s), usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(s_malformed),
                           usecols=cols, index_col=False)
        tm.assert_frame_equal(expected, df)

    def test_usecols_index_col_conflict(self):
        """``index_col`` is resolved against the ``usecols`` selection.

        Every combination of positional/label ``usecols`` and
        positional/label ``index_col`` must give the same result; a
        two-level ``index_col`` is checked as well.
        """
        # see gh-4201: test that index_col as integer reflects usecols
        data = 'a,b,c,d\nA,a,1,one\nB,b,2,two'
        expected = DataFrame({'c': [1, 2]}, index=Index(
            ['a', 'b'], name='b'))

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=['b', 'c'],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col='b')
        tm.assert_frame_equal(expected, df)

        df = self.read_csv(StringIO(data), usecols=[1, 2],
                           index_col=0)
        tm.assert_frame_equal(expected, df)

        expected = DataFrame(
            {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')})
        expected = expected.set_index(['b', 'c'])
        df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'],
                           index_col=['b', 'c'])
        tm.assert_frame_equal(expected, df)

    def test_usecols_implicit_index_col(self):
        """``usecols`` by name works when rows have one extra leading field.

        The data rows carry four fields under a three-column header; the
        expected frame uses the first field of each row as the index.
        """
        # see gh-2654
        data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'

        result = self.read_csv(StringIO(data), usecols=['a', 'b'])
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)

    def test_usecols_regex_sep(self):
        """``usecols`` by name works with a regex whitespace separator."""
        # see gh-2733
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b'))

        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_whitespace(self):
        """``usecols`` by name works with ``delim_whitespace=True``."""
        data = 'a b c\n4 apple bat 5.7\n8 orange cow 10'

        result = self.read_csv(StringIO(data), delim_whitespace=True,
                               usecols=('a', 'b'))
        expected = DataFrame({'a': ['apple', 'orange'],
                              'b': ['bat', 'cow']}, index=[4, 8])

        tm.assert_frame_equal(result, expected)

    def test_usecols_with_integer_like_header(self):
        """Integer-looking header labels do not confuse ``usecols``.

        Integer entries select by position, string entries select by
        label, even though the labels here are '2', '0' and '1'.
        """
        data = """2,0,1
1000,2000,3000
4000,5000,6000
"""

        usecols = [0, 1]  # column selection by index
        expected = DataFrame(data=[[1000, 2000],
                                   [4000, 5000]],
                             columns=['2', '0'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

        usecols = ['0', '1']  # column selection by name
        expected = DataFrame(data=[[2000, 3000],
                                   [5000, 6000]],
                             columns=['0', '1'])
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates(self):
        """Combine ``usecols`` with ``parse_dates`` in several layouts.

        Covers merged date columns selected in different ``usecols``
        orders, a parsed date column used as the index, a parsed first
        column among ten, and a merged 'a_b' column that is expected to
        remain a plain string.
        """
        # See gh-9755
        s = """a,b,c,d,e
0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # The order of the usecols entries must not change the result.
        df = self.read_csv(StringIO(s), usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        # See gh-13604
        s = """2008-02-07 09:40,1032.43
2008-02-07 09:50,1042.54
2008-02-07 10:00,1051.65
"""
        parse_dates = [0]
        names = ['date', 'values']
        usecols = names[:]

        index = Index([Timestamp('2008-02-07 09:40'),
                       Timestamp('2008-02-07 09:50'),
                       Timestamp('2008-02-07 10:00')],
                      name='date')
        cols = {'values': [1032.43, 1042.54, 1051.65]}
        expected = DataFrame(cols, index=index)

        df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0,
                           usecols=usecols, header=None, names=names)
        tm.assert_frame_equal(df, expected)

        # See gh-14792
        s = """a,b,c,d,e,f,g,h,i,j
2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [0]
        usecols = list('abcdefghij')
        cols = {'a': Timestamp('2016-09-21'),
                'b': [1], 'c': [1], 'd': [2],
                'e': [3], 'f': [4], 'g': [5],
                'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=usecols)
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8"""
        parse_dates = [[0, 1]]
        usecols = list('abcdefghij')
        cols = {'a_b': '2016/09/21 1',
                'c': [1], 'd': [2], 'e': [3], 'f': [4],
                'g': [5], 'h': [6], 'i': [7], 'j': [8]}
        expected = DataFrame(cols, columns=['a_b'] + list('cdefghij'))
        df = self.read_csv(StringIO(s), usecols=usecols,
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates_and_full_names(self):
        """Merged date parsing when ``names`` labels all five columns.

        The two ``usecols`` orderings must produce the same frame.
        """
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('abcde')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_parse_dates_and_usecol_names(self):
        """Merged date parsing when ``names`` labels only the used columns.

        The two ``usecols`` orderings must produce the same frame.
        """
        # See gh-9755
        s = """0,1,20140101,0900,4
0,1,20140102,1000,4"""
        parse_dates = [[1, 2]]
        names = list('acd')

        cols = {
            'a': [0, 0],
            'c_d': [
                Timestamp('2014-01-01 09:00:00'),
                Timestamp('2014-01-02 10:00:00')
            ]
        }
        expected = DataFrame(cols, columns=['c_d', 'a'])

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[0, 2, 3],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(s), names=names,
                           usecols=[3, 0, 2],
                           parse_dates=parse_dates)
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_unicode_strings(self):
        """``usecols`` accepts column names given as unicode literals."""
        # see gh-13219

        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AAA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'BBB': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_single_byte_unicode_strings(self):
        """``usecols`` accepts one-character unicode column names."""
        # see gh-13219

        s = '''A,B,C,D
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'A': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'B': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'A', u'B'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_mixed_encoding_strings(self):
        """Mixing unicode and byte strings in ``usecols`` must raise."""
        s = '''AAA,BBB,CCC,DDD
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB'])

        with tm.assert_raises_regex(ValueError, self.msg_validate_usecols_arg):
            self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB'])

    def test_usecols_with_multibyte_characters(self):
        """``usecols`` accepts non-ASCII column names as native strings."""
        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=['あああ', 'いい'])
        tm.assert_frame_equal(df, expected)

    def test_usecols_with_multibyte_unicode_characters(self):
        """``usecols`` with non-ASCII names given as unicode literals.

        Currently skipped unconditionally; the body below documents the
        intended check once gh-13253 is resolved.
        """
        pytest.skip('TODO: see gh-13253')

        s = '''あああ,いい,ううう,ええええ
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''
        data = {
            'あああ': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'いい': {0: 8, 1: 2, 2: 7}
        }
        expected = DataFrame(data)

        df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい'])
        tm.assert_frame_equal(df, expected)

    def test_empty_usecols(self):
        """An empty ``usecols`` set gives an empty frame without raising."""
        # should not raise
        data = 'a,b,c\n1,2,3\n4,5,6'
        expected = DataFrame()
        result = self.read_csv(StringIO(data), usecols=set())
        tm.assert_frame_equal(result, expected)

    def test_np_array_usecols(self):
        """``usecols`` may be a NumPy array of column names."""
        # See gh-12546
        data = 'a,b,c\n1,2,3'
        usecols = np.array(['a', 'b'])

        expected = DataFrame([[1, 2]], columns=usecols)
        result = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(result, expected)

    def test_callable_usecols(self):
        """``usecols`` may be a callable applied to each column name.

        A callable that returns False for every name must give an empty
        frame.
        """
        # See gh-14154
        s = '''AaA,bBb,CCC,ddd
0.056674973,8,True,a
2.613230982,2,False,b
3.568935038,7,False,a
'''

        data = {
            'AaA': {
                0: 0.056674972999999997,
                1: 2.6132309819999997,
                2: 3.5689350380000002
            },
            'bBb': {0: 8, 1: 2, 2: 7},
            'ddd': {0: 'a', 1: 'b', 2: 'a'}
        }
        expected = DataFrame(data)
        df = self.read_csv(StringIO(s), usecols=lambda x:
                           x.upper() in ['AAA', 'BBB', 'DDD'])
        tm.assert_frame_equal(df, expected)

        # Check that a callable returning only False returns
        # an empty DataFrame
        expected = DataFrame()
        df = self.read_csv(StringIO(s), usecols=lambda x: False)
        tm.assert_frame_equal(df, expected)

    def test_incomplete_first_row(self):
        """A short first row yields NaN for a selected missing column.

        Checked with ``usecols`` as a list of names and as a callable.
        """
        # see gh-6710
        data = '1,2\n1,2,3'
        names = ['a', 'b', 'c']
        expected = DataFrame({'a': [1, 1],
                              'c': [np.nan, 3]})

        usecols = ['a', 'c']
        df = self.read_csv(StringIO(data), names=names, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(data), names=names,
                           usecols=lambda x: x in ['a', 'c'])
        tm.assert_frame_equal(df, expected)

    def test_uneven_length_cols(self):
        """Rows with extra trailing fields are cut down to ``usecols``."""
        # see gh-8985
        usecols = [0, 1, 2]
        data = '19,29,39\n' * 2 + '10,20,30,40'
        expected = DataFrame([[19, 29, 39],
                              [19, 29, 39],
                              [10, 20, 30]])
        df = self.read_csv(StringIO(data), header=None, usecols=usecols)
        tm.assert_frame_equal(df, expected)

        # see gh-9549
        usecols = ['A', 'B', 'C']
        data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n'
                '1,2,3,,,1,\n1,2,3\n5,6,7')
        expected = DataFrame({'A': [1, 3, 1, 1, 1, 5],
                              'B': [2, 4, 2, 2, 2, 6],
                              'C': [3, 5, 4, 3, 3, 7]})
        df = self.read_csv(StringIO(data), usecols=usecols)
        tm.assert_frame_equal(df, expected)

    def test_raise_on_usecols_names_mismatch(self):
        """``usecols`` labels absent from the columns raise ``ValueError``.

        The error message is expected to list the missing labels, both
        with the header labels and with replacement ``names``.
        """
        # GH 14671
        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

        usecols = ['a', 'b', 'c', 'd']
        df = self.read_csv(StringIO(data), usecols=usecols)
        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
                              'd': [4, 8]})
        tm.assert_frame_equal(df, expected)

        usecols = ['a', 'b', 'c', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        usecols = ['a', 'b', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        # Two missing labels may be reported in either order.
        usecols = ['a', 'b', 'f', 'g']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\[('f', 'g'|'g', 'f')\]")):
            self.read_csv(StringIO(data), usecols=usecols)

        names = ['A', 'B', 'C', 'D']

        df = self.read_csv(StringIO(data), header=0, names=names)
        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
                              'D': [4, 8]})
        tm.assert_frame_equal(df, expected)

        # TODO: https://github.com/pandas-dev/pandas/issues/16469
        # usecols = ['A','C']
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)
        #
        # usecols = [0,2]
        # df = self.read_csv(StringIO(data), header=0, names=names,
        #                    usecols=usecols)
        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
        # tm.assert_frame_equal(df, expected)

        usecols = ['A', 'B', 'C', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), header=0, names=names,
                          usecols=usecols)

        usecols = ['A', 'B', 'f']
        with tm.assert_raises_regex(ValueError,
                                    self.msg_validate_usecols_names.format(
                                        r"\['f'\]")):
            self.read_csv(StringIO(data), names=names, usecols=usecols)