laywerrobot/lib/python3.6/site-packages/pandas/tests/io/parser/na_values.py

372 lines
12 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
"""
Tests that NA values are properly handled during
parsing for all of the parsers defined in parsers.py
"""
import numpy as np
from numpy import nan
import pandas.io.common as com
import pandas.util.testing as tm
from pandas import DataFrame, Index, MultiIndex
from pandas.compat import StringIO, range
class NAvaluesTests(object):
    """
    Mixin of NA-handling tests shared by the CSV parsers.

    The class does not define ``read_csv`` or ``read_table`` itself; every
    test calls them through ``self``, so they must be supplied by whatever
    class this one is combined with (presumably one concrete test class
    per parser engine -- confirm against the test harness).
    """

    def test_string_nas(self):
        """Empty fields in string columns are read back as NaN."""
        data = """A,B,C
a,b,c
d,,f
,g,h
"""
        result = self.read_csv(StringIO(data))
        expected = DataFrame([['a', 'b', 'c'],
                              ['d', np.nan, 'f'],
                              [np.nan, 'g', 'h']],
                             columns=['A', 'B', 'C'])
        tm.assert_frame_equal(result, expected)

    def test_detect_string_na(self):
        """The literals 'NA', 'NaN' and 'nan' are all read as NaN."""
        data = """A,B
foo,bar
NA,baz
NaN,nan
"""
        expected = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]],
                            dtype=np.object_)
        df = self.read_csv(StringIO(data))
        tm.assert_numpy_array_equal(df.values, expected)

    def test_non_string_na_values(self):
        """
        Numeric and string spellings of -999 all work as ``na_values``.

        Each spelling is tried against both fixtures, one of which pads
        the numbers with trailing zeros.
        """
        # see gh-3611: with an odd float format, we can't match
        # the string '999.0' exactly but still need float matching
        nice = """A,B
-999,1.2
2,-999
3,4.5
"""
        ugly = """A,B
-999,1.200
2,-999.000
3,4.500
"""
        na_values_param = [['-999.0', '-999'],
                           [-999, -999.0],
                           [-999.0, -999],
                           ['-999.0'], ['-999'],
                           [-999.0], [-999]]
        expected = DataFrame([[np.nan, 1.2], [2.0, np.nan],
                              [3.0, 4.5]], columns=['A', 'B'])

        # Every na_values spelling must give the same frame for both
        # textual representations of the data.
        for data in (nice, ugly):
            for na_values in na_values_param:
                out = self.read_csv(StringIO(data), na_values=na_values)
                tm.assert_frame_equal(out, expected)

    def test_default_na_values(self):
        """
        Every default NA sentinel is parsed as NaN.

        First checks that the sentinel set hard-coded here still equals
        ``com._NA_VALUES``, then reads an ``nv`` x ``nv`` CSV in which row
        ``i`` holds one sentinel in column ``i`` and empty fields
        elsewhere, expecting an all-NaN frame.
        """
        _NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                      '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
                      'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''}
        assert _NA_VALUES == com._NA_VALUES
        nv = len(_NA_VALUES)

        def f(i, v):
            # Build one CSV row of ``nv`` fields: ``i`` empty fields, the
            # sentinel ``v``, then ``nv - i - 1`` more empty fields.
            return ',' * i + v + ',' * (nv - i - 1)

        data = StringIO('\n'.join(f(i, v) for i, v in enumerate(_NA_VALUES)))
        expected = DataFrame(np.nan, columns=range(nv), index=range(nv))
        df = self.read_csv(data, header=None)
        tm.assert_frame_equal(df, expected)

    def test_custom_na_values(self):
        """
        A custom NA value is honoured alongside the default sentinels.

        The same expectation is checked for ``read_csv`` and for
        ``read_table`` with ``na_values`` given as a list and as a bare
        string.
        """
        data = """A,B,C
ignore,this,row
1,NA,3
-1.#IND,5,baz
7,8,NaN
"""
        expected = np.array([[1., nan, 3],
                             [nan, 5, nan],
                             [7, 8, nan]])

        df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1])
        tm.assert_numpy_array_equal(df.values, expected)

        df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'],
                              skiprows=[1])
        tm.assert_numpy_array_equal(df2.values, expected)

        # na_values passed as a scalar string rather than a list.
        df3 = self.read_table(StringIO(data), sep=',', na_values='baz',
                              skiprows=[1])
        tm.assert_numpy_array_equal(df3.values, expected)

    def test_bool_na_values(self):
        """Boolean columns containing 'NA' come back as object dtype."""
        data = """A,B,C
True,False,True
NA,True,False
False,NA,True"""
        result = self.read_csv(StringIO(data))
        expected = DataFrame({'A': np.array([True, nan, False], dtype=object),
                              'B': np.array([False, True, nan], dtype=object),
                              'C': [True, False, True]})
        tm.assert_frame_equal(result, expected)

    def test_na_value_dict(self):
        """
        Dict ``na_values`` apply per column, including with ``index_col``.

        The second half passes an empty dict together with a single,
        positional multi-column and named multi-column ``index_col``.
        """
        data = """A,B,C
foo,bar,NA
bar,foo,foo
foo,bar,NA
bar,foo,foo"""
        df = self.read_csv(StringIO(data),
                           na_values={'A': ['foo'], 'B': ['bar']})
        expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'],
                              'B': [np.nan, 'foo', np.nan, 'foo'],
                              'C': [np.nan, 'foo', np.nan, 'foo']})
        tm.assert_frame_equal(df, expected)

        data = """\
a,b,c,d
0,NA,1,5
"""
        # Empty dict with a single index column.
        xp = DataFrame({'b': [np.nan], 'c': [1], 'd': [5]}, index=[0])
        xp.index.name = 'a'
        df = self.read_csv(StringIO(data), na_values={}, index_col=0)
        tm.assert_frame_equal(df, xp)

        # Empty dict with a MultiIndex selected by position.
        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        xp.index.names = ['a', 'c']
        df = self.read_csv(StringIO(data), na_values={}, index_col=[0, 2])
        tm.assert_frame_equal(df, xp)

        # Empty dict with a MultiIndex selected by column name.
        xp = DataFrame({'b': [np.nan], 'd': [5]},
                       MultiIndex.from_tuples([(0, 1)]))
        xp.index.names = ['a', 'c']
        df = self.read_csv(StringIO(data), na_values={}, index_col=['a', 'c'])
        tm.assert_frame_equal(df, xp)

    def test_na_values_keep_default(self):
        """
        Interaction between ``na_values`` and ``keep_default_na``.

        Covers the defaults, ``keep_default_na=False`` with dict and list
        ``na_values``, a dict of empty lists with the defaults kept, and
        ``keep_default_na=False`` with no ``na_values`` at all.
        """
        data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        # Defaults: '' and 'nan' become NaN.
        df = self.read_csv(StringIO(data))
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Defaults off and no per-column values: strings are kept as-is.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
                           keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five',
                                  '', 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Defaults off with a custom list: only 'a' becomes NaN.
        df = self.read_csv(
            StringIO(data), na_values=['a'], keep_default_na=False)
        xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # Empty per-column lists with defaults kept: same as the defaults.
        df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
        xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['one', 'two', 'three', np.nan, 'five',
                                  np.nan, 'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

        # see gh-4318: passing na_values=None and
        # keep_default_na=False yields 'None' as a na_value
        data = """\
One,Two,Three
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
        df = self.read_csv(
            StringIO(data), keep_default_na=False)
        xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                        'Two': [1, 2, 3, 4, 5, 6, 7],
                        'Three': ['None', 'two', 'None', 'nan', 'five', '',
                                  'seven']})
        tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

    def test_no_keep_default_na_dict_na_values(self):
        """
        With ``keep_default_na=False``, dict ``na_values`` only affect the
        columns they name, and scalar dict values are accepted.
        """
        # see gh-19227
        data = "a,b\n,2"
        df = self.read_csv(StringIO(data), na_values={"b": ["2"]},
                           keep_default_na=False)
        expected = DataFrame({"a": [""], "b": [np.nan]})
        tm.assert_frame_equal(df, expected)

        # Scalar values shouldn't cause the parsing to crash or fail.
        data = "a,b\n1,2"
        df = self.read_csv(StringIO(data), na_values={"b": 2},
                           keep_default_na=False)
        expected = DataFrame({"a": [1], "b": [np.nan]})
        tm.assert_frame_equal(df, expected)

        data = """\
113125,"blah","/blaha",kjsdkj,412.166,225.874,214.008
729639,"qwer","",asdfkj,466.681,,252.373
"""
        expected = DataFrame({0: [np.nan, 729639.0],
                              1: [np.nan, "qwer"],
                              2: ["/blaha", np.nan],
                              3: ["kjsdkj", "asdfkj"],
                              4: [412.166, 466.681],
                              5: ["225.874", ""],
                              6: [np.nan, 252.373]})

        # Column 0's NA value given as an int ...
        df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
                           na_values={2: "", 6: "214.008",
                                      1: "blah", 0: 113125})
        tm.assert_frame_equal(df, expected)

        # ... and as the equivalent string; both must give the same frame.
        df = self.read_csv(StringIO(data), header=None, keep_default_na=False,
                           na_values={2: "", 6: "214.008",
                                      1: "blah", 0: "113125"})
        tm.assert_frame_equal(df, expected)

    def test_na_values_na_filter_override(self):
        """``na_filter=False`` disables NA detection despite ``na_values``."""
        data = """\
A,B
1,A
nan,B
3,C
"""
        expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']],
                             columns=['A', 'B'])
        out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True)
        tm.assert_frame_equal(out, expected)

        # With filtering off, every field stays a string.
        expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']],
                             columns=['A', 'B'])
        out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False)
        tm.assert_frame_equal(out, expected)

    def test_na_trailing_columns(self):
        """Rows shorter than the header get NaN in the missing columns."""
        data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax
2012-03-14,USD,AAPL,BUY,1000
2012-05-12,USD,SBUX,SELL,500"""
        result = self.read_csv(StringIO(data))
        assert result['Date'][1] == '2012-05-12'
        assert result['UnitPrice'].isna().all()

    def test_na_values_scalar(self):
        """Scalar ``na_values`` work both bare and as dict values."""
        # see gh-12224
        names = ['a', 'b']
        data = '1,2\n2,1'

        expected = DataFrame([[np.nan, 2.0], [2.0, np.nan]],
                             columns=names)
        out = self.read_csv(StringIO(data), names=names, na_values=1)
        tm.assert_frame_equal(out, expected)

        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]],
                             columns=names)
        out = self.read_csv(StringIO(data), names=names,
                            na_values={'a': 2, 'b': 1})
        tm.assert_frame_equal(out, expected)

    def test_na_values_dict_aliasing(self):
        """The ``na_values`` dict passed by the caller is not mutated."""
        na_values = {'a': 2, 'b': 1}
        # Snapshot taken before parsing, compared afterwards.
        na_values_copy = na_values.copy()

        names = ['a', 'b']
        data = '1,2\n2,1'

        expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names)
        out = self.read_csv(StringIO(data), names=names, na_values=na_values)

        tm.assert_frame_equal(out, expected)
        tm.assert_dict_equal(na_values, na_values_copy)

    def test_na_values_dict_col_index(self):
        """Dict ``na_values`` may be keyed by integer column position."""
        # see gh-14203
        data = 'a\nfoo\n1'
        na_values = {0: 'foo'}

        out = self.read_csv(StringIO(data), na_values=na_values)
        expected = DataFrame({'a': [np.nan, 1]})
        tm.assert_frame_equal(out, expected)

    def test_na_values_uint64(self):
        """
        Values of 2**63 and above are expected back as strings.

        Checked once with ``na_values=[2**63]`` and once, without
        ``na_values``, for a column that also holds an empty field.
        """
        # see gh-14983
        na_values = [2**63]
        data = str(2**63) + '\n' + str(2**63 + 1)
        expected = DataFrame([str(2**63), str(2**63 + 1)])
        out = self.read_csv(StringIO(data), header=None, na_values=na_values)
        tm.assert_frame_equal(out, expected)

        data = str(2**63) + ',1' + '\n,2'
        expected = DataFrame([[str(2**63), 1], ['', 2]])
        out = self.read_csv(StringIO(data), header=None)
        tm.assert_frame_equal(out, expected)

    def test_empty_na_values_no_default_with_index(self):
        """``keep_default_na=False`` with ``index_col`` and no ``na_values``."""
        # see gh-15835
        data = "a,1\nb,2"

        # The first line is the header, so the columns are 'a' and '1'.
        expected = DataFrame({'1': [2]}, index=Index(["b"], name="a"))
        out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0)
        tm.assert_frame_equal(out, expected)

    def test_no_na_filter_on_index(self):
        """``na_filter`` also controls NA parsing of the index column."""
        # see gh-5239
        data = "a,b,c\n1,,3\n4,5,6"

        # Don't parse NA-values in index when na_filter=False.
        out = self.read_csv(StringIO(data), index_col=[1], na_filter=False)
        expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                             index=Index(["", "5"], name="b"))
        tm.assert_frame_equal(out, expected)

        # Parse NA-values in index when na_filter=True.
        out = self.read_csv(StringIO(data), index_col=[1], na_filter=True)
        expected = DataFrame({"a": [1, 4], "c": [3, 6]},
                             index=Index([np.nan, 5.0], name="b"))
        tm.assert_frame_equal(out, expected)