# -*- coding: utf-8 -*-

"""
Tests date parsing functionality for all of the
parsers defined in parsers.py
"""

from distutils.version import LooseVersion
from datetime import datetime, date

import pytest
import numpy as np

from pandas._libs.tslibs import parsing
from pandas._libs.tslib import Timestamp

import pandas as pd
import pandas.io.parsers as parsers
import pandas.core.tools.datetimes as tools
import pandas.util.testing as tm
import pandas.io.date_converters as conv
from pandas import DataFrame, Series, Index, DatetimeIndex, MultiIndex
from pandas import compat
from pandas.compat import parse_date, StringIO, lrange
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.indexes.datetimes import date_range


class ParseDatesTests(object):

    def test_separator_date_conflict(self):
        # Regression test for gh-4678: make sure thousands separator and
        # date parsing do not conflict.
        data = '06-02-2013;13:00;1-000.215'
        expected = DataFrame(
            [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]],
            columns=['Date', 2]
        )

        df = self.read_csv(StringIO(data), sep=';', thousands='-',
                           parse_dates={'Date': [0, 1]}, header=None)
        tm.assert_frame_equal(df, expected)

    def test_multiple_date_col(self):
        # Can use multiple date parsers
        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

        def func(*date_cols):
            res = parsing.try_parse_dates(
                parsers._concat_date_cols(date_cols))
            return res

        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           prefix='X',
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]})
        assert 'nominal' in df
        assert 'actual' in df
        assert 'X1' not in df
        assert 'X2' not in df
        assert 'X3' not in df

        d = datetime(1999, 1, 27, 19, 0)
        assert df.loc[0, 'nominal'] == d

        df = self.read_csv(StringIO(data), header=None,
                           date_parser=func,
                           parse_dates={'nominal': [1, 2],
                                        'actual': [1, 3]},
                           keep_date_col=True)
        assert 'nominal' in df
        assert 'actual' in df
        assert 1 in df
        assert 2 in df
        assert 3 in df

        data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
        df = self.read_csv(StringIO(data), header=None,
                           prefix='X', parse_dates=[[1, 2], [1, 3]])
        assert 'X1_X2' in df
        assert 'X1_X3' in df
        assert 'X1' not in df
        assert 'X2' not in df
        assert 'X3' not in df

        d = datetime(1999, 1, 27, 19, 0)
        assert df.loc[0, 'X1_X2'] == d

        df = self.read_csv(StringIO(data), header=None,
                           parse_dates=[[1, 2], [1, 3]],
                           keep_date_col=True)
        assert '1_2' in df
        assert '1_3' in df
        assert 1 in df
        assert 2 in df
        assert 3 in df

        data = '''\
KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
        df = self.read_csv(StringIO(data), sep=',', header=None,
                           parse_dates=[1], index_col=1)
        d = datetime(1999, 1, 27, 19, 0)
        assert df.index[0] == d

    def test_multiple_date_cols_int_cast(self):
        data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
                "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
                "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
                "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
                "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
                "KORD,19990127, 23:00:00, 22:56:00, -0.5900")
        date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
        import pandas.io.date_converters as conv

        # it works!
        df = self.read_csv(StringIO(data), header=None,
                           parse_dates=date_spec,
                           date_parser=conv.parse_date_time)
        assert 'nominal' in df

    def test_multiple_date_col_timestamp_parse(self):
        data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25
05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25"""

        result = self.read_csv(StringIO(data), sep=',', header=None,
                               parse_dates=[[0, 1]], date_parser=Timestamp)

        ex_val = Timestamp('05/31/2012 15:30:00.029')
        assert result['0_1'][0] == ex_val

    def test_multiple_date_cols_with_header(self):
        data = """\
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""

        df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
        assert not isinstance(df.nominal[0], compat.string_types)

    ts_data = """\
ID,date,nominalTime,actualTime,A,B,C,D,E
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

    def test_multiple_date_col_name_collision(self):
        with pytest.raises(ValueError):
            self.read_csv(StringIO(self.ts_data),
                          parse_dates={'ID': [1, 2]})

        data = """\
date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000"""  # noqa

        with pytest.raises(ValueError):
            self.read_csv(StringIO(data), parse_dates=[[1, 2]])

    def test_date_parser_int_bug(self):
        # See gh-3071
        log_file = StringIO(
            'posix_timestamp,elapsed,sys,user,queries,query_time,rows,'
            'accountid,userid,contactid,level,silo,method\n'
            '1343103150,0.062353,0,4,6,0.01690,3,'
            '12345,1,-1,3,invoice_InvoiceResource,search\n'
        )

        def f(posix_string):
            return datetime.utcfromtimestamp(int(posix_string))

        # it works!
        self.read_csv(log_file, index_col=0, parse_dates=[0],
                      date_parser=f)

    def test_nat_parse(self):
        # See gh-3062
        df = DataFrame(dict({'A': np.asarray(lrange(10), dtype='float64'),
                             'B': pd.Timestamp('20010101')}))
        df.iloc[3:6, :] = np.nan

        with tm.ensure_clean('__nat_parse_.csv') as path:
            df.to_csv(path)
            result = self.read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)

            expected = Series(dict(A='float64', B='datetime64[ns]'))
            tm.assert_series_equal(expected, result.dtypes)

            # test with NaT for the nan_rep
            # we don't have a method to specify the Datetime na_rep
            # (it defaults to '')
            df.to_csv(path)
            result = self.read_csv(path, index_col=0, parse_dates=['B'])
            tm.assert_frame_equal(result, df)

    def test_csv_custom_parser(self):
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        f = lambda x: datetime.strptime(x, '%Y%m%d')
        df = self.read_csv(StringIO(data), date_parser=f)
        expected = self.read_csv(StringIO(data), parse_dates=True)
        tm.assert_frame_equal(df, expected)

    def test_parse_dates_implicit_first_col(self):
        data = """A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        df = self.read_csv(StringIO(data), parse_dates=True)
        expected = self.read_csv(StringIO(data), index_col=0,
                                 parse_dates=True)
        assert isinstance(df.index[0], (datetime, np.datetime64, Timestamp))
        tm.assert_frame_equal(df, expected)

    def test_parse_dates_string(self):
        data = """date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5
"""
        rs = self.read_csv(StringIO(data), index_col='date',
                           parse_dates=['date'])
        idx = date_range('1/1/2009', periods=3)
        idx.name = 'date'
        xp = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [1, 3, 4],
                        'C': [2, 4, 5]}, idx)
        tm.assert_frame_equal(rs, xp)

    def test_yy_format_with_yearfirst(self):
        data = """date,time,B,C
090131,0010,1,2
090228,1020,3,4
090331,0830,5,6
"""

        # See gh-217
        import dateutil
        if LooseVersion(dateutil.__version__) >= LooseVersion('2.5.0'):
            pytest.skip("testing yearfirst=True is not supported "
                        "on dateutil < 2.5.0; this works but is wrong")

        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[['date', 'time']])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)],
                            dtype=object, name='date_time')
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)

        rs = self.read_csv(StringIO(data), index_col=0,
                           parse_dates=[[0, 1]])
        idx = DatetimeIndex([datetime(2009, 1, 31, 0, 10, 0),
                             datetime(2009, 2, 28, 10, 20, 0),
                             datetime(2009, 3, 31, 8, 30, 0)],
                            dtype=object, name='date_time')
        xp = DataFrame({'B': [1, 3, 5], 'C': [2, 4, 6]}, idx)
        tm.assert_frame_equal(rs, xp)

    def test_parse_dates_column_list(self):
        data = 'a,b,c\n01/01/2010,1,15/02/2010'

        expected = DataFrame({'a': [datetime(2010, 1, 1)], 'b': [1],
                              'c': [datetime(2010, 2, 15)]})
        expected = expected.set_index(['a', 'b'])

        df = self.read_csv(StringIO(data), index_col=[0, 1],
                           parse_dates=[0, 2], dayfirst=True)
        tm.assert_frame_equal(df, expected)

        df = self.read_csv(StringIO(data), index_col=[0, 1],
                           parse_dates=['a', 'c'], dayfirst=True)
        tm.assert_frame_equal(df, expected)

    def test_multi_index_parse_dates(self):
        data = """index1,index2,A,B,C
20090101,one,a,1,2
20090101,two,b,3,4
20090101,three,c,4,5
20090102,one,a,1,2
20090102,two,b,3,4
20090102,three,c,4,5
20090103,one,a,1,2
20090103,two,b,3,4
20090103,three,c,4,5
"""
        df = self.read_csv(StringIO(data), index_col=[0, 1],
                           parse_dates=True)
        assert isinstance(df.index.levels[0][0],
                          (datetime, np.datetime64, Timestamp))

        # specify columns out of order!
        df2 = self.read_csv(StringIO(data), index_col=[1, 0],
                            parse_dates=True)
        assert isinstance(df2.index.levels[1][0],
                          (datetime, np.datetime64, Timestamp))

    def test_parse_dates_custom_euroformat(self):
        text = """foo,bar,baz
31/01/2010,1,2
01/02/2010,1,NA
02/02/2010,1,2
"""
        parser = lambda d: parse_date(d, dayfirst=True)
        df = self.read_csv(StringIO(text),
                           names=['time', 'Q', 'NTU'], header=0,
                           index_col=0, parse_dates=True,
                           date_parser=parser, na_values=['NA'])

        exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
                           datetime(2010, 2, 2)], name='time')
        expected = DataFrame({'Q': [1, 1, 1], 'NTU': [2, np.nan, 2]},
                             index=exp_index, columns=['Q', 'NTU'])
        tm.assert_frame_equal(df, expected)

        parser = lambda d: parse_date(d, day_first=True)
        pytest.raises(TypeError, self.read_csv,
                      StringIO(text), skiprows=[0],
                      names=['time', 'Q', 'NTU'], index_col=0,
                      parse_dates=True, date_parser=parser,
                      na_values=['NA'])

    def test_parse_tz_aware(self):
        # See gh-1693
        import pytz
        data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5")

        # it works
        result = self.read_csv(data, index_col=0, parse_dates=True)
        stamp = result.index[0]
        assert stamp.minute == 39
        try:
            assert result.index.tz is pytz.utc
        except AssertionError:  # hello Yaroslav
            arr = result.index.to_pydatetime()
            result = tools.to_datetime(arr, utc=True)[0]
            assert stamp.minute == result.minute
            assert stamp.hour == result.hour
            assert stamp.day == result.day

    def test_multiple_date_cols_index(self):
        data = """
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir
KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""

        xp = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]})
        df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
                           index_col='nominal')
        tm.assert_frame_equal(xp.set_index('nominal'), df)

        df2 = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]},
                            index_col=0)
        tm.assert_frame_equal(df2, df)

        df3 = self.read_csv(StringIO(data), parse_dates=[[1, 2]],
                            index_col=0)
        tm.assert_frame_equal(df3, df, check_names=False)

    def test_multiple_date_cols_chunked(self):
        df = self.read_csv(StringIO(self.ts_data),
                           parse_dates={'nominal': [1, 2]},
                           index_col='nominal')
        reader = self.read_csv(StringIO(self.ts_data),
                               parse_dates={'nominal': [1, 2]},
                               index_col='nominal', chunksize=2)
        chunks = list(reader)

        assert 'nominalTime' not in df

        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

    def test_multiple_date_col_named_components(self):
        xp = self.read_csv(StringIO(self.ts_data),
                           parse_dates={'nominal': [1, 2]},
                           index_col='nominal')
        colspec = {'nominal': ['date', 'nominalTime']}
        df = self.read_csv(StringIO(self.ts_data), parse_dates=colspec,
                           index_col='nominal')
        tm.assert_frame_equal(df, xp)

    def test_multiple_date_col_multiple_index(self):
        df = self.read_csv(StringIO(self.ts_data),
                           parse_dates={'nominal': [1, 2]},
                           index_col=['nominal', 'ID'])
        xp = self.read_csv(StringIO(self.ts_data),
                           parse_dates={'nominal': [1, 2]})
        tm.assert_frame_equal(xp.set_index(['nominal', 'ID']), df)

    def test_read_with_parse_dates_scalar_non_bool(self):
        # See gh-5636
        errmsg = ("Only booleans, lists, and "
                  "dictionaries are accepted "
                  "for the 'parse_dates' parameter")
        data = """A,B,C
1,2,2003-11-1"""

        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
                               StringIO(data), parse_dates="C")
        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
                               StringIO(data), parse_dates="C",
                               index_col="C")

    def test_read_with_parse_dates_invalid_type(self):
        errmsg = ("Only booleans, lists, and "
                  "dictionaries are accepted "
                  "for the 'parse_dates' parameter")
        data = """A,B,C
1,2,2003-11-1"""

        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
                               StringIO(data), parse_dates=(1,))
        tm.assert_raises_regex(TypeError, errmsg,
                               self.read_csv, StringIO(data),
                               parse_dates=np.array([4, 5]))
        tm.assert_raises_regex(TypeError, errmsg, self.read_csv,
                               StringIO(data), parse_dates=set([1, 3, 3]))

    def test_parse_dates_empty_string(self):
        # see gh-2263
        data = "Date, test\n2012-01-01, 1\n,2"
        result = self.read_csv(StringIO(data), parse_dates=["Date"],
                               na_filter=False)
        assert result['Date'].isna()[1]

    def test_parse_dates_noconvert_thousands(self):
        # see gh-14066
        data = 'a\n04.15.2016'

        expected = DataFrame([datetime(2016, 4, 15)], columns=['a'])
        result = self.read_csv(StringIO(data), parse_dates=['a'],
                               thousands='.')
        tm.assert_frame_equal(result, expected)

        exp_index = DatetimeIndex(['2016-04-15'], name='a')
        expected = DataFrame(index=exp_index)
        result = self.read_csv(StringIO(data), index_col=0,
                               parse_dates=True, thousands='.')
        tm.assert_frame_equal(result, expected)

        data = 'a,b\n04.15.2016,09.16.2013'

        expected = DataFrame([[datetime(2016, 4, 15),
                               datetime(2013, 9, 16)]],
                             columns=['a', 'b'])
        result = self.read_csv(StringIO(data), parse_dates=['a', 'b'],
                               thousands='.')
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([[datetime(2016, 4, 15),
                               datetime(2013, 9, 16)]],
                             columns=['a', 'b'])
        expected = expected.set_index(['a', 'b'])
        result = self.read_csv(StringIO(data), index_col=[0, 1],
                               parse_dates=True, thousands='.')
        tm.assert_frame_equal(result, expected)

    def test_parse_date_time_multi_level_column_name(self):
        data = """\
D,T,A,B
date, time,a,b
2001-01-05, 09:00:00, 0.0, 10.
2001-01-06, 00:00:00, 1.0, 11.
"""
        datecols = {'date_time': [0, 1]}
        result = self.read_csv(StringIO(data), sep=',', header=[0, 1],
                               parse_dates=datecols,
                               date_parser=conv.parse_date_time)

        expected_data = [[datetime(2001, 1, 5, 9, 0, 0), 0., 10.],
                         [datetime(2001, 1, 6, 0, 0, 0), 1., 11.]]
        expected = DataFrame(expected_data,
                             columns=['date_time', ('A', 'a'), ('B', 'b')])
        tm.assert_frame_equal(result, expected)

    def test_parse_date_time(self):
        dates = np.array(['2007/1/3', '2008/2/4'], dtype=object)
        times = np.array(['05:07:09', '06:08:00'], dtype=object)
        expected = np.array([datetime(2007, 1, 3, 5, 7, 9),
                             datetime(2008, 2, 4, 6, 8, 0)])

        result = conv.parse_date_time(dates, times)
        assert (result == expected).all()

        data = """\
date, time, a, b
2001-01-05, 10:00:00, 0.0, 10.
2001-01-05, 00:00:00, 1., 11.
""" datecols = {'date_time': [0, 1]} df = self.read_csv(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_time) assert 'date_time' in df assert df.date_time.loc[0] == datetime(2001, 1, 5, 10, 0, 0) data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" "KORD,19990127, 23:00:00, 22:56:00, -0.5900") date_spec = {'nominal': [1, 2], 'actual': [1, 3]} df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, date_parser=conv.parse_date_time) def test_parse_date_fields(self): years = np.array([2007, 2008]) months = np.array([1, 2]) days = np.array([3, 4]) result = conv.parse_date_fields(years, months, days) expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) assert (result == expected).all() data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n" "2001 , 02 , 1 , 11.") datecols = {'ymd': [0, 1, 2]} df = self.read_csv(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_fields) assert 'ymd' in df assert df.ymd.loc[0] == datetime(2001, 1, 10) def test_datetime_six_col(self): years = np.array([2007, 2008]) months = np.array([1, 2]) days = np.array([3, 4]) hours = np.array([5, 6]) minutes = np.array([7, 8]) seconds = np.array([9, 0]) expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) assert (result == expected).all() data = """\ year, month, day, hour, minute, second, a, b 2001, 01, 05, 10, 00, 0, 0.0, 10. 2001, 01, 5, 10, 0, 00, 1., 11. """ datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} df = self.read_csv(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) assert 'ymdHMS' in df assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0) def test_datetime_fractional_seconds(self): data = """\ year, month, day, hour, minute, second, a, b 2001, 01, 05, 10, 00, 0.123456, 0.0, 10. 2001, 01, 5, 10, 0, 0.500000, 1., 11. """ datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} df = self.read_csv(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) assert 'ymdHMS' in df assert df.ymdHMS.loc[0] == datetime(2001, 1, 5, 10, 0, 0, microsecond=123456) assert df.ymdHMS.loc[1] == datetime(2001, 1, 5, 10, 0, 0, microsecond=500000) def test_generic(self): data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
        datecols = {'ym': [0, 1]}
        dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1)
        df = self.read_csv(StringIO(data), sep=',', header=0,
                           parse_dates=datecols,
                           date_parser=dateconverter)
        assert 'ym' in df
        assert df.ym.loc[0] == date(2001, 1, 1)

    def test_dateparser_resolution_if_not_ns(self):
        # GH 10245
        data = """\
date,time,prn,rxstatus
2013-11-03,19:00:00,126,00E80000
2013-11-03,19:00:00,23,00E80000
2013-11-03,19:00:00,13,00E80000
"""

        def date_parser(date, time):
            datetime = np_array_datetime64_compat(
                date + 'T' + time + 'Z', dtype='datetime64[s]')
            return datetime

        df = self.read_csv(StringIO(data), date_parser=date_parser,
                           parse_dates={'datetime': ['date', 'time']},
                           index_col=['datetime', 'prn'])

        datetimes = np_array_datetime64_compat(['2013-11-03T19:00:00Z'] * 3,
                                               dtype='datetime64[s]')
        df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3},
                               index=MultiIndex.from_tuples(
                                   [(datetimes[0], 126),
                                    (datetimes[1], 23),
                                    (datetimes[2], 13)],
                                   names=['datetime', 'prn']))
        tm.assert_frame_equal(df, df_correct)

    def test_parse_date_column_with_empty_string(self):
        # GH 6428
        data = """case,opdate
7,10/18/2006
7,10/18/2008
621, """
        result = self.read_csv(StringIO(data), parse_dates=['opdate'])
        expected_data = [[7, '10/18/2006'],
                         [7, '10/18/2008'],
                         [621, ' ']]
        expected = DataFrame(expected_data, columns=['case', 'opdate'])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("data,expected", [
        ("a\n135217135789158401\n1352171357E+5",
         DataFrame({"a": [135217135789158401,
                          135217135700000]}, dtype="float64")),
        ("a\n99999999999\n123456789012345\n1234E+0",
         DataFrame({"a": [99999999999,
                          123456789012345,
                          1234]}, dtype="float64"))
    ])
    @pytest.mark.parametrize("parse_dates", [True, False])
    def test_parse_date_float(self, data, expected, parse_dates):
        # see gh-2697
        #
        # Date parsing should fail, so we leave the data untouched
        # (i.e. float precision should remain unchanged).
        result = self.read_csv(StringIO(data), parse_dates=parse_dates)
        tm.assert_frame_equal(result, expected)