153 lines
4.8 KiB
Python
153 lines
4.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
Tests column conversion functionality during parsing
|
|
for all of the parsers defined in parsers.py
|
|
"""
|
|
|
|
from datetime import datetime
|
|
|
|
import pytest
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas._libs.tslib import Timestamp
|
|
from pandas import DataFrame, Index
|
|
from pandas.compat import parse_date, StringIO, lmap
|
|
|
|
|
|
class ConverterTests(object):
    """
    Checks for the ``converters`` argument of ``read_csv``.

    Every test goes through ``self.read_csv``, which this class does not
    define; presumably a parser-specific test class supplies it -- confirm
    against the classes that inherit from this one.
    """

    def test_converters_type_must_be_dict(self):
        """A non-dict ``converters`` value is rejected with a TypeError."""
        data = """index,A,B,C,D
foo,2,3,4,5
"""
        expected_msg = 'Type converters.+'
        with tm.assert_raises_regex(TypeError, expected_msg):
            self.read_csv(StringIO(data), converters=0)

    def test_converters(self):
        """Converters may be keyed by column label or by position."""
        data = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""
        by_label = self.read_csv(StringIO(data),
                                 converters={'D': parse_date})
        by_position = self.read_csv(StringIO(data),
                                    converters={3: parse_date})

        # Reference frame: parse without converters, then map the column.
        baseline = self.read_csv(StringIO(data))
        baseline['D'] = baseline['D'].map(parse_date)

        first_value = by_label['D'][0]
        assert isinstance(first_value, (datetime, Timestamp))
        tm.assert_frame_equal(by_label, baseline)
        tm.assert_frame_equal(by_position, baseline)

        # produce integer
        def year_of(value):
            # Third '/'-separated field of the date string, as an int.
            return int(value.split('/')[2])

        converted = self.read_csv(StringIO(data), converters={'D': year_of})
        baseline = self.read_csv(StringIO(data))
        baseline['D'] = baseline['D'].map(year_of)
        tm.assert_frame_equal(converted, baseline)

    def test_converters_no_implicit_conv(self):
        """A converter that returns strings leaves the column as object."""
        # see gh-2184
        data = """000102,1.2,A\n001245,2,B"""

        def strip_text(value):
            return value.strip()

        frame = self.read_csv(StringIO(data), header=None,
                              converters={0: strip_text})
        assert frame[0].dtype == object

    def test_converters_euro_decimal_format(self):
        """Comma-decimal columns become float via a custom converter."""
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

        def comma_to_float(value):
            return float(value.replace(",", "."))

        numeric_columns = ('Number1', 'Number2', 'Number3')
        converters = dict.fromkeys(numeric_columns, comma_to_float)
        frame = self.read_csv(StringIO(data), sep=';',
                              converters=converters)
        for column in numeric_columns:
            assert frame[column].dtype == float

    def test_converter_return_string_bug(self):
        """Same input as the euro-decimal test; checks ``Number1`` only."""
        # see gh-583
        data = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,738797819
2;121,12;14897,76;DEF;uyt;0,377320872
3;878,158;108013,434;GHI;rez;2,735694704"""

        def comma_to_float(value):
            return float(value.replace(",", "."))

        converters = dict.fromkeys(('Number1', 'Number2', 'Number3'),
                                   comma_to_float)
        frame = self.read_csv(StringIO(data), sep=';',
                              converters=converters)
        assert frame['Number1'].dtype == float

    def test_converters_corner_with_nas(self):
        """Converters that return NaN for blank fields give missing values."""
        # skip aberration observed on Win64 Python 3.2.2
        if hash(np.int64(-1)) != -2:
            pytest.skip("skipping because of windows hash on Python"
                        " 3.2.2")

        data = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

        def convert_days(raw):
            # Blank -> NaN; "N+" -> N + 1; otherwise the plain integer.
            text = raw.strip()
            if not text:
                return np.nan
            if text.endswith('+'):
                return int(text[:-1]) + 1
            return int(text)

        def convert_days_sentinel(raw):
            # NOTE(review): currently identical to convert_days, so the
            # frame comparison at the end of this test cannot fail.
            text = raw.strip()
            if not text:
                return np.nan
            if text.endswith('+'):
                return int(text[:-1]) + 1
            return int(text)

        def convert_score(raw):
            # Blank -> NaN; "lo-hi" -> midpoint of the range; else float.
            text = raw.strip()
            if not text:
                return np.nan
            if text.find('-') > 0:
                low, high = lmap(int, text.split('-'))
                return 0.5 * (low + high)
            return float(text)

        plain_converters = {'score': convert_score,
                            'days': convert_days}
        result = self.read_csv(StringIO(data), converters=plain_converters,
                               na_values=['', None])
        assert pd.isna(result['days'][1])

        sentinel_converters = {'score': convert_score,
                               'days': convert_days_sentinel}
        result2 = self.read_csv(StringIO(data),
                                converters=sentinel_converters,
                                na_values=['', None])
        tm.assert_frame_equal(result, result2)

    def test_converter_index_col_bug(self):
        """A converter on the ``index_col`` column keeps the index name."""
        # see gh-1835
        data = "A;B\n1;2\n3;4"

        identity = {'A': lambda x: x}
        parsed = self.read_csv(StringIO(data), sep=';', index_col='A',
                               converters=identity)

        wanted_index = Index([1, 3], name='A')
        wanted = DataFrame({'B': [2, 4]}, index=wanted_index)
        tm.assert_frame_equal(parsed, wanted)
        assert parsed.index.name == wanted.index.name
|