371 lines
12 KiB
Python
371 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
|
|
|
|
import numpy as np
|
|
from numpy import nan
|
|
|
|
import pandas.io.common as com
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas import DataFrame, Index, MultiIndex
|
|
from pandas.compat import StringIO, range
|
|
|
|
|
|
class NAvaluesTests(object):
    """
    Checks for how the CSV/table readers recognise missing values.

    Every test goes through ``self.read_csv`` (and, in one case,
    ``self.read_table``). Neither is defined in this class; presumably a
    concrete parser test class mixing this one in provides them -- confirm
    against the consumer.
    """

    def test_string_nas(self):
        """Empty fields in string columns come back as NaN."""
        csv_text = ("A,B,C\n"
                    "a,b,c\n"
                    "d,,f\n"
                    ",g,h\n")
        want = DataFrame([['a', 'b', 'c'],
                          ['d', np.nan, 'f'],
                          [np.nan, 'g', 'h']],
                         columns=['A', 'B', 'C'])

        got = self.read_csv(StringIO(csv_text))
        tm.assert_frame_equal(got, want)

    def test_detect_string_na(self):
        """The tokens ``NA``, ``NaN`` and ``nan`` are all read as NaN."""
        csv_text = ("A,B\n"
                    "foo,bar\n"
                    "NA,baz\n"
                    "NaN,nan\n")
        want = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]],
                        dtype=np.object_)

        frame = self.read_csv(StringIO(csv_text))
        tm.assert_numpy_array_equal(frame.values, want)

    def test_non_string_na_values(self):
        """Numeric and string spellings of -999 all mark a cell as NaN."""
        # see gh-3611: with an odd float format, we can't match
        # the string '999.0' exactly but still need float matching
        nice = ("A,B\n"
                "-999,1.2\n"
                "2,-999\n"
                "3,4.5\n")
        ugly = ("A,B\n"
                "-999,1.200\n"
                "2,-999.000\n"
                "3,4.500\n")
        na_values_param = [['-999.0', '-999'],
                           [-999, -999.0],
                           [-999.0, -999],
                           ['-999.0'], ['-999'],
                           [-999.0], [-999]]
        want = DataFrame([[np.nan, 1.2], [2.0, np.nan],
                          [3.0, 4.5]], columns=['A', 'B'])

        # Every NA spelling must give the same frame for both layouts.
        for csv_text in (nice, ugly):
            for markers in na_values_param:
                got = self.read_csv(StringIO(csv_text), na_values=markers)
                tm.assert_frame_equal(got, want)

    def test_default_na_values(self):
        """Each default NA token is read as NaN with no extra options."""
        _NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                      '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
                      'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''}
        assert _NA_VALUES == com._NA_VALUES
        nv = len(_NA_VALUES)

        def make_row(pos, token):
            # One CSV row of ``nv`` fields: ``token`` sits in field ``pos``
            # and every other field is empty.
            return ',' * pos + token + ',' * (nv - pos - 1)

        rows = [make_row(pos, token)
                for pos, token in enumerate(_NA_VALUES)]
        want = DataFrame(np.nan, columns=range(nv), index=range(nv))

        got = self.read_csv(StringIO('\n'.join(rows)), header=None)
        tm.assert_frame_equal(got, want)

    def test_custom_na_values(self):
        """A user-supplied NA token is honoured together with the defaults."""
        csv_text = ("A,B,C\n"
                    "ignore,this,row\n"
                    "1,NA,3\n"
                    "-1.#IND,5,baz\n"
                    "7,8,NaN\n")
        want = np.array([[1., nan, 3],
                         [nan, 5, nan],
                         [7, 8, nan]])

        from_csv = self.read_csv(StringIO(csv_text), na_values=['baz'],
                                 skiprows=[1])
        tm.assert_numpy_array_equal(from_csv.values, want)

        from_table = self.read_table(StringIO(csv_text), sep=',',
                                     na_values=['baz'], skiprows=[1])
        tm.assert_numpy_array_equal(from_table.values, want)

        # A bare string is accepted in place of a one-element list.
        from_table_scalar = self.read_table(StringIO(csv_text), sep=',',
                                            na_values='baz', skiprows=[1])
        tm.assert_numpy_array_equal(from_table_scalar.values, want)

    def test_bool_na_values(self):
        """Boolean columns containing NA are returned as object columns."""
        csv_text = ("A,B,C\n"
                    "True,False,True\n"
                    "NA,True,False\n"
                    "False,NA,True")
        want = DataFrame({'A': np.array([True, nan, False], dtype=object),
                          'B': np.array([False, True, nan], dtype=object),
                          'C': [True, False, True]})

        got = self.read_csv(StringIO(csv_text))
        tm.assert_frame_equal(got, want)

    def test_na_value_dict(self):
        """``na_values`` given as a dict, with and without index columns."""
        csv_text = ("A,B,C\n"
                    "foo,bar,NA\n"
                    "bar,foo,foo\n"
                    "foo,bar,NA\n"
                    "bar,foo,foo")
        want = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                          'B': [np.nan, 'foo', np.nan, 'foo'],
                          'C': [np.nan, 'foo', np.nan, 'foo']})
        got = self.read_csv(StringIO(csv_text),
                            na_values={'A': ['foo'], 'B': ['bar']})
        tm.assert_frame_equal(got, want)

        csv_text = ("a,b,c,d\n"
                    "0,NA,1,5\n")

        # An empty dict combined with a single index column.
        want = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
        want.index.name = 'a'
        got = self.read_csv(StringIO(csv_text), na_values={}, index_col=0)
        tm.assert_frame_equal(got, want)

        # Same input with a two-level index, selected first by position
        # and then by name.
        for index_cols in ([0, 2], ['a', 'c']):
            want = DataFrame({'b': [np.nan], 'd': [5]},
                             index=MultiIndex.from_tuples([(0, 1)]))
            want.index.names = ['a', 'c']
            got = self.read_csv(StringIO(csv_text), na_values={},
                                index_col=index_cols)
            tm.assert_frame_equal(got, want)

    def test_na_values_keep_default(self):
        """``keep_default_na`` decides whether default NA tokens apply."""
        csv_text = ("One,Two,Three\n"
                    "a,1,one\n"
                    "b,2,two\n"
                    ",3,three\n"
                    "d,4,nan\n"
                    "e,5,five\n"
                    "nan,6,\n"
                    "g,7,seven\n")

        # Plain read: the default tokens are active.
        got = self.read_csv(StringIO(csv_text))
        want = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                          'Two': [1, 2, 3, 4, 5, 6, 7],
                          'Three': ['one', 'two', 'three', np.nan, 'five',
                                    np.nan, 'seven']})
        tm.assert_frame_equal(want.reindex(columns=got.columns), got)

        # Empty per-column lists with the defaults switched off.
        got = self.read_csv(StringIO(csv_text),
                            na_values={'One': [], 'Three': []},
                            keep_default_na=False)
        want = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                          'Two': [1, 2, 3, 4, 5, 6, 7],
                          'Three': ['one', 'two', 'three', 'nan', 'five',
                                    '', 'seven']})
        tm.assert_frame_equal(want.reindex(columns=got.columns), got)

        # A single custom token with the defaults switched off.
        got = self.read_csv(StringIO(csv_text), na_values=['a'],
                            keep_default_na=False)
        want = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                          'Two': [1, 2, 3, 4, 5, 6, 7],
                          'Three': ['one', 'two', 'three', 'nan', 'five',
                                    '', 'seven']})
        tm.assert_frame_equal(want.reindex(columns=got.columns), got)

        # Empty per-column lists with the defaults left on.
        got = self.read_csv(StringIO(csv_text),
                            na_values={'One': [], 'Three': []})
        want = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                          'Two': [1, 2, 3, 4, 5, 6, 7],
                          'Three': ['one', 'two', 'three', np.nan, 'five',
                                    np.nan, 'seven']})
        tm.assert_frame_equal(want.reindex(columns=got.columns), got)

        # see gh-4318: passing na_values=None and
        # keep_default_na=False yields 'None' as a na_value
        csv_text = ("One,Two,Three\n"
                    "a,1,None\n"
                    "b,2,two\n"
                    ",3,None\n"
                    "d,4,nan\n"
                    "e,5,five\n"
                    "nan,6,\n"
                    "g,7,seven\n")
        got = self.read_csv(StringIO(csv_text), keep_default_na=False)
        want = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                          'Two': [1, 2, 3, 4, 5, 6, 7],
                          'Three': ['None', 'two', 'None', 'nan', 'five',
                                    '', 'seven']})
        tm.assert_frame_equal(want.reindex(columns=got.columns), got)

    def test_no_keep_default_na_dict_na_values(self):
        """Dict ``na_values`` still apply when ``keep_default_na=False``."""
        # see gh-19227
        csv_text = "a,b\n,2"

        got = self.read_csv(StringIO(csv_text), na_values={"b": ["2"]},
                            keep_default_na=False)
        want = DataFrame({"a": [""], "b": [np.nan]})
        tm.assert_frame_equal(got, want)

        # Scalar values shouldn't cause the parsing to crash or fail.
        csv_text = "a,b\n1,2"

        got = self.read_csv(StringIO(csv_text), na_values={"b": 2},
                            keep_default_na=False)
        want = DataFrame({"a": [1], "b": [np.nan]})
        tm.assert_frame_equal(got, want)

        csv_text = (
            '113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008\n'
            '729639,"qwer","",asdfkj,466.681,,252.373\n'
        )
        want = DataFrame({0: [np.nan, 729639.0],
                          1: [np.nan, "qwer"],
                          2: ["/blaha", np.nan],
                          3: ["kjsdkj", "asdfkj"],
                          4: [412.166, 466.681],
                          5: ["225.874", ""],
                          6: [np.nan, 252.373]})

        # The marker for column 0 is tried as an int and as a string;
        # both spellings must produce the same frame.
        for first_col_marker in (113125, "113125"):
            got = self.read_csv(StringIO(csv_text), header=None,
                                keep_default_na=False,
                                na_values={2: "", 6: "214.008",
                                           1: "blah", 0: first_col_marker})
            tm.assert_frame_equal(got, want)

    def test_na_values_na_filter_override(self):
        """``na_filter=False`` disables NA detection despite ``na_values``."""
        csv_text = ("A,B\n"
                    "1,A\n"
                    "nan,B\n"
                    "3,C\n")

        want = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
                         columns=['A', 'B'])
        got = self.read_csv(StringIO(csv_text), na_values=['B'],
                            na_filter=True)
        tm.assert_frame_equal(got, want)

        want = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
                         columns=['A', 'B'])
        got = self.read_csv(StringIO(csv_text), na_values=['B'],
                            na_filter=False)
        tm.assert_frame_equal(got, want)

    def test_na_trailing_columns(self):
        """Columns with no data in any row are read as all-NA."""
        csv_text = (
            "Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax\n"
            "2012-03-14,USD,AAPL,BUY,1000\n"
            "2012-05-12,USD,SBUX,SELL,500"
        )

        frame = self.read_csv(StringIO(csv_text))
        assert frame['Date'][1] == '2012-05-12'
        assert frame['UnitPrice'].isna().all()

    def test_na_values_scalar(self):
        """Scalar ``na_values`` work both bare and as dict values."""
        # see gh-12224
        names = ['a', 'b']
        csv_text = '1,2\n2,1'

        want = DataFrame([[np.nan, 2.0], [2.0, np.nan]],
                         columns=names)
        got = self.read_csv(StringIO(csv_text), names=names, na_values=1)
        tm.assert_frame_equal(got, want)

        want = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
                         columns=names)
        got = self.read_csv(StringIO(csv_text), names=names,
                            na_values={'a': 2, 'b': 1})
        tm.assert_frame_equal(got, want)

    def test_na_values_dict_aliasing(self):
        """The caller's ``na_values`` dict is left unmodified by the read."""
        na_values = {'a': 2, 'b': 1}
        snapshot = na_values.copy()

        names = ['a', 'b']
        csv_text = '1,2\n2,1'

        want = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
        got = self.read_csv(StringIO(csv_text), names=names,
                            na_values=na_values)

        tm.assert_frame_equal(got, want)
        tm.assert_dict_equal(na_values, snapshot)

    def test_na_values_dict_col_index(self):
        """Dict ``na_values`` may be keyed by column position."""
        # see gh-14203

        csv_text = 'a\nfoo\n1'
        markers = {0: 'foo'}

        got = self.read_csv(StringIO(csv_text), na_values=markers)
        want = DataFrame({'a': [np.nan, 1]})
        tm.assert_frame_equal(got, want)

    def test_na_values_uint64(self):
        """Values from 2**63 upwards are expected back as strings here."""
        # see gh-14983

        big = 2**63

        csv_text = str(big) + '\n' + str(big + 1)
        want = DataFrame([str(big), str(big + 1)])
        got = self.read_csv(StringIO(csv_text), header=None,
                            na_values=[big])
        tm.assert_frame_equal(got, want)

        csv_text = str(big) + ',1' + '\n,2'
        want = DataFrame([[str(big), 1], ['', 2]])
        got = self.read_csv(StringIO(csv_text), header=None)
        tm.assert_frame_equal(got, want)

    def test_empty_na_values_no_default_with_index(self):
        """``keep_default_na=False`` with ``index_col`` set parses cleanly."""
        # see gh-15835
        csv_text = "a,1\nb,2"

        want = DataFrame({'1': [2]}, index=Index(["b"], name="a"))
        got = self.read_csv(StringIO(csv_text), keep_default_na=False,
                            index_col=0)

        tm.assert_frame_equal(got, want)

    def test_no_na_filter_on_index(self):
        """``na_filter`` also governs NA detection in the index column."""
        # see gh-5239
        csv_text = "a,b,c\n1,,3\n4,5,6"

        # Don't parse NA-values in index when na_filter=False.
        got = self.read_csv(StringIO(csv_text), index_col=[1],
                            na_filter=False)
        want = DataFrame({"a": [1, 4], "c": [3, 6]},
                         index=Index(["", "5"], name="b"))
        tm.assert_frame_equal(got, want)

        # Parse NA-values in index when na_filter=True.
        got = self.read_csv(StringIO(csv_text), index_col=[1],
                            na_filter=True)
        want = DataFrame({"a": [1, 4], "c": [3, 6]},
                         index=Index([np.nan, 5.0], name="b"))
        tm.assert_frame_equal(got, want)