# -*- coding: utf-8 -*- import csv import os import platform import codecs import re import sys from datetime import datetime from collections import OrderedDict import pytest import numpy as np from pandas._libs.tslib import Timestamp import pandas as pd import pandas.util.testing as tm from pandas import DataFrame, Series, Index, MultiIndex from pandas import compat from pandas.compat import (StringIO, BytesIO, PY3, range, lrange, u) from pandas.errors import DtypeWarning, EmptyDataError, ParserError from pandas.io.common import URLError from pandas.io.parsers import TextFileReader, TextParser class ParserTests(object): """ Want to be able to test either C+Cython or Python+Cython parsers """ data1 = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo2,12,13,14,15 bar2,12,13,14,15 """ def test_empty_decimal_marker(self): data = """A|B|C 1|2,334|5 10|13|10. """ # Parsers support only length-1 decimals msg = 'Only length-1 decimal markers supported' with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(data), decimal='') def test_bad_stream_exception(self): # Issue 13652: # This test validates that both python engine # and C engine will raise UnicodeDecodeError instead of # c engine raising ParserError and swallowing exception # that caused read to fail. codec = codecs.lookup("utf-8") utf8 = codecs.lookup('utf-8') if compat.PY3: msg = "'utf-8' codec can't decode byte" else: msg = "'utf8' codec can't decode byte" # stream must be binary UTF8 with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder( handle, utf8.encode, utf8.decode, codec.streamreader, codec.streamwriter) as stream: with tm.assert_raises_regex(UnicodeDecodeError, msg): self.read_csv(stream) def test_read_csv(self): if not compat.PY3: if compat.is_platform_windows(): prefix = u("file:///") else: prefix = u("file://") fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): data = """A|B|C 1|2,334|5 10|13|10. """ expected = DataFrame({ 'A': [1, 10], 'B': [2334, 13], 'C': [5, 10.] }) df = self.read_csv(StringIO(data), sep='|', thousands=',') tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data), sep='|', thousands=',') tm.assert_frame_equal(df, expected) def test_squeeze(self): data = """\ a,1 b,2 c,3 """ idx = Index(['a', 'b', 'c'], name=0) expected = Series([1, 2, 3], name=1, index=idx) result = self.read_table(StringIO(data), sep=',', index_col=0, header=None, squeeze=True) assert isinstance(result, Series) tm.assert_series_equal(result, expected) def test_squeeze_no_view(self): # see gh-8217 # Series should not be a view data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13""" result = self.read_csv(StringIO(data), index_col='time', squeeze=True) assert not result._is_view def test_malformed(self): # see gh-6607 # all data = """ignore A,B,C 1,2,3 # comment 1,2,3,4,5 2,3,4 """ msg = 'Expected 3 fields in line 4, saw 5' with tm.assert_raises_regex(Exception, msg): self.read_table(StringIO(data), sep=',', header=1, comment='#') # first chunk data = """ignore A,B,C skip 1,2,3 3,5,10 # comment 1,2,3,4,5 2,3,4 """ msg = 'Expected 3 fields in line 6, saw 5' with tm.assert_raises_regex(Exception, msg): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, skiprows=[2]) it.read(5) # middle chunk data = """ignore A,B,C skip 1,2,3 3,5,10 # comment 1,2,3,4,5 2,3,4 """ msg = 'Expected 3 fields in line 6, saw 5' with tm.assert_raises_regex(Exception, msg): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, skiprows=[2]) it.read(3) # last chunk data = """ignore A,B,C skip 1,2,3 3,5,10 # comment 1,2,3,4,5 2,3,4 """ msg = 'Expected 3 fields in line 6, saw 5' with tm.assert_raises_regex(Exception, msg): it = self.read_table(StringIO(data), sep=',', header=1, comment='#', iterator=True, chunksize=1, skiprows=[2]) it.read() # skipfooter is not supported with the C parser yet if self.engine == 'python': # skipfooter data = """ignore A,B,C 1,2,3 # comment 1,2,3,4,5 2,3,4 footer """ msg = 'Expected 3 fields in line 4, saw 5' with tm.assert_raises_regex(Exception, msg): self.read_table(StringIO(data), sep=',', header=1, comment='#', skipfooter=1) def test_quoting(self): bad_line_small = """printer\tresult\tvariant_name Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois Klosterdruckerei\tKlosterdruckerei (1609-1805)\tHochfurstliche Buchhandlung """ # noqa pytest.raises(Exception, self.read_table, StringIO(bad_line_small), sep='\t') good_line_small = bad_line_small + '"' df = self.read_table(StringIO(good_line_small), sep='\t') assert len(df) == 3 def test_unnamed_columns(self): data = """A,B,C,, 1,2,3,4,5 6,7,8,9,10 11,12,13,14,15 """ expected = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], dtype=np.int64) df = self.read_table(StringIO(data), sep=',') tm.assert_almost_equal(df.values, expected) tm.assert_index_equal(df.columns, Index(['A', 'B', 'C', 'Unnamed: 3', 'Unnamed: 4'])) def test_csv_mixed_type(self): data = """A,B,C a,1,2 b,3,4 c,4,5 """ expected = DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 3, 4], 'C': [2, 4, 5]}) out = self.read_csv(StringIO(data)) tm.assert_frame_equal(out, expected) def test_read_csv_low_memory_no_rows_with_index(self): if self.engine == "c" and not self.low_memory: pytest.skip("This is a low-memory specific test") # see gh-21141 data = """A,B,C 1,1,1,2 2,2,3,4 3,3,4,5 """ out = self.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0) expected = DataFrame(columns=["A", "B", "C"]) tm.assert_frame_equal(out, expected) def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, parse_dates=True) tm.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D'])) assert df.index.name == 'index' assert isinstance( df.index[0], (datetime, np.datetime64, Timestamp)) assert df.values.dtype == np.float64 tm.assert_frame_equal(df, df2) def test_read_csv_no_index_name(self): df = self.read_csv(self.csv2, index_col=0, parse_dates=True) df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) tm.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D', 'E'])) assert isinstance(df.index[0], (datetime, np.datetime64, Timestamp)) assert df.loc[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64 tm.assert_frame_equal(df, df2) def test_read_table_unicode(self): fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) df1 = self.read_table(fin, sep=";", encoding="utf-8", header=None) assert isinstance(df1[0].values[0], compat.text_type) def test_read_table_wrong_num_columns(self): # too few! data = """A,B,C,D,E,F 1,2,3,4,5,6 6,7,8,9,10,11,12 11,12,13,14,15,16 """ pytest.raises(ValueError, self.read_csv, StringIO(data)) def test_read_duplicate_index_explicit(self): data = """index,A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo,12,13,14,15 bar,12,13,14,15 """ result = self.read_csv(StringIO(data), index_col=0) expected = self.read_csv(StringIO(data)).set_index( 'index', verify_integrity=False) tm.assert_frame_equal(result, expected) result = self.read_table(StringIO(data), sep=',', index_col=0) expected = self.read_table(StringIO(data), sep=',', ).set_index( 'index', verify_integrity=False) tm.assert_frame_equal(result, expected) def test_read_duplicate_index_implicit(self): data = """A,B,C,D foo,2,3,4,5 bar,7,8,9,10 baz,12,13,14,15 qux,12,13,14,15 foo,12,13,14,15 bar,12,13,14,15 """ # make sure an error isn't thrown self.read_csv(StringIO(data)) self.read_table(StringIO(data), sep=',') def test_parse_bools(self): data = """A,B True,1 False,2 True,3 """ data = self.read_csv(StringIO(data)) assert data['A'].dtype == np.bool_ data = """A,B YES,1 no,2 yes,3 No,3 Yes,3 """ data = self.read_csv(StringIO(data), true_values=['yes', 'Yes', 'YES'], false_values=['no', 'NO', 'No']) assert data['A'].dtype == np.bool_ data = """A,B TRUE,1 FALSE,2 TRUE,3 """ data = self.read_csv(StringIO(data)) assert data['A'].dtype == np.bool_ data = """A,B foo,bar bar,foo""" result = self.read_csv(StringIO(data), true_values=['foo'], false_values=['bar']) expected = DataFrame({'A': [True, False], 'B': [False, True]}) tm.assert_frame_equal(result, expected) def test_int_conversion(self): data = """A,B 1.0,1 2.0,2 3.0,3 """ data = self.read_csv(StringIO(data)) assert data['A'].dtype == np.float64 assert data['B'].dtype == np.int64 def test_read_nrows(self): expected = self.read_csv(StringIO(self.data1))[:3] df = self.read_csv(StringIO(self.data1), nrows=3) tm.assert_frame_equal(df, expected) # see gh-10476 df = self.read_csv(StringIO(self.data1), nrows=3.0) tm.assert_frame_equal(df, expected) msg = r"'nrows' must be an integer >=0" with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), nrows=1.2) with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), nrows='foo') with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), nrows=-1) def test_read_chunksize(self): reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) chunks = list(reader) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) # with invalid chunksize value: msg = r"'chunksize' must be an integer >=1" with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), chunksize=1.3) with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), chunksize='foo') with tm.assert_raises_regex(ValueError, msg): self.read_csv(StringIO(self.data1), chunksize=0) def test_read_chunksize_and_nrows(self): # gh-15755 # With nrows reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2, nrows=5) df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) tm.assert_frame_equal(pd.concat(reader), df) # chunksize > nrows reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=8, nrows=5) df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) tm.assert_frame_equal(pd.concat(reader), df) # with changing "size": reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=8, nrows=5) df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2]) tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5]) with pytest.raises(StopIteration): reader.get_chunk(size=3) def test_read_chunksize_named(self): reader = self.read_csv( StringIO(self.data1), index_col='index', chunksize=2) df = self.read_csv(StringIO(self.data1), index_col='index') chunks = list(reader) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) def test_get_chunk_passed_chunksize(self): data = """A,B,C 1,2,3 4,5,6 7,8,9 1,2,3""" result = self.read_csv(StringIO(data), chunksize=2) piece = result.get_chunk() assert len(piece) == 2 def test_read_chunksize_generated_index(self): # GH 12185 reader = self.read_csv(StringIO(self.data1), chunksize=2) df = self.read_csv(StringIO(self.data1)) tm.assert_frame_equal(pd.concat(reader), df) reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0) df = self.read_csv(StringIO(self.data1), index_col=0) tm.assert_frame_equal(pd.concat(reader), df) def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', '4', '5', '6']] df = self.read_csv(StringIO(data), index_col=0) parser = TextParser(as_list, index_col=0, chunksize=2) chunk = parser.read(None) tm.assert_frame_equal(chunk, df) def test_iterator(self): # See gh-6607 reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True) df = self.read_csv(StringIO(self.data1), index_col=0) chunk = reader.read(3) tm.assert_frame_equal(chunk, df[:3]) last_chunk = reader.read(5) tm.assert_frame_equal(last_chunk, df[3:]) # pass list lines = list(csv.reader(StringIO(self.data1))) parser = TextParser(lines, index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) # pass skiprows parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) chunks = list(parser) tm.assert_frame_equal(chunks[0], df[1:3]) treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, iterator=True) assert isinstance(treader, TextFileReader) # gh-3967: stopping iteration when chunksize is specified data = """A,B,C foo,1,2,3 bar,4,5,6 baz,7,8,9 """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ 3, 6, 9]), index=['foo', 'bar', 'baz']) assert len(result) == 3 tm.assert_frame_equal(pd.concat(result), expected) # skipfooter is not supported with the C parser yet if self.engine == 'python': # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, iterator=True, skipfooter=1) pytest.raises(ValueError, reader.read, 3) def test_pass_names_with_index(self): lines = self.data1.split('\n') no_header = '\n'.join(lines[1:]) # regular index names = ['index', 'A', 'B', 'C', 'D'] df = self.read_csv(StringIO(no_header), index_col=0, names=names) expected = self.read_csv(StringIO(self.data1), index_col=0) tm.assert_frame_equal(df, expected) # multi index data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 foo,three,12,13,14,15 bar,one,12,13,14,15 bar,two,12,13,14,15 """ lines = data.split('\n') no_header = '\n'.join(lines[1:]) names = ['index1', 'index2', 'A', 'B', 'C', 'D'] df = self.read_csv(StringIO(no_header), index_col=[0, 1], names=names) expected = self.read_csv(StringIO(data), index_col=[0, 1]) tm.assert_frame_equal(df, expected) df = self.read_csv(StringIO(data), index_col=['index1', 'index2']) tm.assert_frame_equal(df, expected) def test_multi_index_no_level_names(self): data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 foo,three,12,13,14,15 bar,one,12,13,14,15 bar,two,12,13,14,15 """ data2 = """A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 foo,three,12,13,14,15 bar,one,12,13,14,15 bar,two,12,13,14,15 """ lines = data.split('\n') no_header = '\n'.join(lines[1:]) names = ['A', 'B', 'C', 'D'] df = self.read_csv(StringIO(no_header), index_col=[0, 1], header=None, names=names) expected = self.read_csv(StringIO(data), index_col=[0, 1]) tm.assert_frame_equal(df, expected, check_names=False) # 2 implicit first cols df2 = self.read_csv(StringIO(data2)) tm.assert_frame_equal(df2, df) # reverse order of index df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names, header=None) expected = self.read_csv(StringIO(data), index_col=[1, 0]) tm.assert_frame_equal(df, expected, check_names=False) def test_multi_index_blank_df(self): # GH 14545 data = """a,b """ df = self.read_csv(StringIO(data), header=[0]) expected = DataFrame(columns=['a', 'b']) tm.assert_frame_equal(df, expected) round_trip = self.read_csv(StringIO( expected.to_csv(index=False)), header=[0]) tm.assert_frame_equal(round_trip, expected) data_multiline = """a,b c,d """ df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) expected2 = DataFrame(columns=cols) tm.assert_frame_equal(df2, expected2) round_trip = self.read_csv(StringIO( expected2.to_csv(index=False)), header=[0, 1]) tm.assert_frame_equal(round_trip, expected2) def test_no_unnamed_index(self): data = """ id c0 c1 c2 0 1 0 a b 1 2 0 c d 2 2 2 e f """ df = self.read_table(StringIO(data), sep=' ') assert df.index.name is None def test_read_csv_parse_simple_list(self): text = """foo bar baz qux foo foo bar""" df = self.read_csv(StringIO(text), header=None) expected = DataFrame({0: ['foo', 'bar baz', 'qux foo', 'foo', 'bar']}) tm.assert_frame_equal(df, expected) @tm.network def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow def test_file(self, datapath): localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: url_table = self.read_table('file://localhost/' + localtable) except URLError: # fails on some systems pytest.skip("failing on %s" % ' '.join(platform.uname()).strip()) tm.assert_frame_equal(url_table, local_table) def test_path_pathlib(self): df = tm.makeDataFrame() result = tm.round_trip_pathlib(df.to_csv, lambda p: self.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) def test_path_localpath(self): df = tm.makeDataFrame() result = tm.round_trip_localpath( df.to_csv, lambda p: self.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) def test_nonexistent_path(self): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError path = '%s.csv' % tm.rands(10) pytest.raises(compat.FileNotFoundError, self.read_csv, path) def test_missing_trailing_delimiters(self): data = """A,B,C,D 1,2,3,4 1,3,3, 1,4,5""" result = self.read_csv(StringIO(data)) assert result['D'].isna()[1:].all() def test_skipinitialspace(self): s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') sfile = StringIO(s) # it's 33 columns result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], header=None, skipinitialspace=True) assert pd.isna(result.iloc[0, 29]) def test_utf16_bom_skiprows(self): # #2298 data = u("""skip this skip this too A\tB\tC 1\t2\t3 4\t5\t6""") data2 = u("""skip this skip this too A,B,C 1,2,3 4,5,6""") path = '__%s__.csv' % tm.rands(10) with tm.ensure_clean(path) as path: for sep, dat in [('\t', data), (',', data2)]: for enc in ['utf-16', 'utf-16le', 'utf-16be']: bytes = dat.encode(enc) with open(path, 'wb') as f: f.write(bytes) s = BytesIO(dat.encode('utf-8')) if compat.PY3: # somewhat False since the code never sees bytes from io import TextIOWrapper s = TextIOWrapper(s, encoding='utf-8') result = self.read_csv(path, encoding=enc, skiprows=2, sep=sep) expected = self.read_csv(s, encoding='utf-8', skiprows=2, sep=sep) s.close() tm.assert_frame_equal(result, expected) def test_utf16_example(self, datapath): path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! and is the right length result = self.read_table(path, encoding='utf-16') assert len(result) == 50 if not compat.PY3: buf = BytesIO(open(path, 'rb').read()) result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 def test_unicode_encoding(self, datapath): pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) got = result[1][1632] expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') assert got == expected def test_trailing_delimiters(self): # #2442. grumble grumble data = """A,B,C 1,2,3, 4,5,6, 7,8,9,""" result = self.read_csv(StringIO(data), index_col=False) expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], 'C': [3, 6, 9]}) tm.assert_frame_equal(result, expected) def test_escapechar(self): # http://stackoverflow.com/questions/13824840/feature-request-for- # pandas-read-csv data = '''SEARCH_TERM,ACTUAL_URL "bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" "tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" "SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa result = self.read_csv(StringIO(data), escapechar='\\', quotechar='"', encoding='utf-8') assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", ' 'IKEA:s 1700-tals serie') tm.assert_index_equal(result.columns, Index(['SEARCH_TERM', 'ACTUAL_URL'])) def test_int64_min_issues(self): # #2599 data = 'A,B\n0,0\n0,' result = self.read_csv(StringIO(data)) expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]}) tm.assert_frame_equal(result, expected) def test_parse_integers_above_fp_precision(self): data = """Numbers 17007000002000191 17007000002000191 17007000002000191 17007000002000191 17007000002000192 17007000002000192 17007000002000192 17007000002000192 17007000002000192 17007000002000194""" result = self.read_csv(StringIO(data)) expected = DataFrame({'Numbers': [17007000002000191, 17007000002000191, 17007000002000191, 17007000002000191, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000192, 17007000002000194]}) tm.assert_series_equal(result['Numbers'], expected['Numbers']) def test_chunks_have_consistent_numerical_type(self): integers = [str(i) for i in range(499999)] data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) with tm.assert_produces_warning(False): df = self.read_csv(StringIO(data)) # Assert that types were coerced. assert type(df.a[0]) is np.float64 assert df.a.dtype == np.float def test_warn_if_chunks_have_mismatched_type(self): warning_type = False integers = [str(i) for i in range(499999)] data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) # see gh-3866: if chunks are different types and can't # be coerced using numerical types, then issue warning. if self.engine == 'c' and self.low_memory: warning_type = DtypeWarning with tm.assert_produces_warning(warning_type): df = self.read_csv(StringIO(data)) assert df.a.dtype == np.object def test_integer_overflow_bug(self): # see gh-2601 data = "65248E10 11\n55555E55 22\n" result = self.read_csv(StringIO(data), header=None, sep=' ') assert result[0].dtype == np.float64 result = self.read_csv(StringIO(data), header=None, sep=r'\s+') assert result[0].dtype == np.float64 def test_catch_too_many_names(self): # see gh-5156 data = """\ 1,2,3 4,,6 7,8,9 10,11,12\n""" pytest.raises(ValueError, self.read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) def test_ignore_leading_whitespace(self): # see gh-3374, gh-6607 data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' result = self.read_table(StringIO(data), sep=r'\s+') expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) tm.assert_frame_equal(result, expected) def test_chunk_begins_with_newline_whitespace(self): # see gh-10022 data = '\n hello\nworld\n' result = self.read_csv(StringIO(data), header=None) assert len(result) == 2 # see gh-9735: this issue is C parser-specific (bug when # parsing whitespace and characters at chunk boundary) if self.engine == 'c': chunk1 = 'a' * (1024 * 256 - 2) + '\na' chunk2 = '\n a' result = self.read_csv(StringIO(chunk1 + chunk2), header=None) expected = DataFrame(['a' * (1024 * 256 - 2), 'a', ' a']) tm.assert_frame_equal(result, expected) def test_empty_with_index(self): # see gh-10184 data = 'x,y' result = self.read_csv(StringIO(data), index_col=0) expected = DataFrame([], columns=['y'], index=Index([], name='x')) tm.assert_frame_equal(result, expected) def test_empty_with_multiindex(self): # see gh-10467 data = 'x,y,z' result = self.read_csv(StringIO(data), index_col=['x', 'y']) expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( [[]] * 2, names=['x', 'y'])) tm.assert_frame_equal(result, expected, check_index_type=False) def test_empty_with_reversed_multiindex(self): data = 'x,y,z' result = self.read_csv(StringIO(data), index_col=[1, 0]) expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( [[]] * 2, names=['y', 'x'])) tm.assert_frame_equal(result, expected, check_index_type=False) def test_float_parser(self): # see gh-9565 data = '45e-1,4.5,45.,inf,-inf' result = self.read_csv(StringIO(data), header=None) expected = DataFrame([[float(s) for s in data.split(',')]]) tm.assert_frame_equal(result, expected) def test_scientific_no_exponent(self): # see gh-12215 df = DataFrame.from_dict(OrderedDict([('w', ['2e']), ('x', ['3E']), ('y', ['42e']), ('z', ['632E'])])) data = df.to_csv(index=False) for prec in self.float_precision_choices: df_roundtrip = self.read_csv( StringIO(data), float_precision=prec) tm.assert_frame_equal(df_roundtrip, df) def test_int64_overflow(self): data = """ID 00013007854817840016671868 00013007854817840016749251 00013007854817840016754630 00013007854817840016781876 00013007854817840017028824 00013007854817840017963235 00013007854817840018860166""" # 13007854817840016671868 > UINT64_MAX, so this # will overflow and return object as the dtype. result = self.read_csv(StringIO(data)) assert result['ID'].dtype == object # 13007854817840016671868 > UINT64_MAX, so attempts # to cast to either int64 or uint64 will result in # an OverflowError being raised. for conv in (np.int64, np.uint64): pytest.raises(OverflowError, self.read_csv, StringIO(data), converters={'ID': conv}) # These numbers fall right inside the int64-uint64 range, # so they should be parsed as string. ui_max = np.iinfo(np.uint64).max i_max = np.iinfo(np.int64).max i_min = np.iinfo(np.int64).min for x in [i_max, i_min, ui_max]: result = self.read_csv(StringIO(str(x)), header=None) expected = DataFrame([x]) tm.assert_frame_equal(result, expected) # These numbers fall just outside the int64-uint64 range, # so they should be parsed as string. too_big = ui_max + 1 too_small = i_min - 1 for x in [too_big, too_small]: result = self.read_csv(StringIO(str(x)), header=None) expected = DataFrame([str(x)]) tm.assert_frame_equal(result, expected) # No numerical dtype can hold both negative and uint64 values, # so they should be cast as string. data = '-1\n' + str(2**63) expected = DataFrame([str(-1), str(2**63)]) result = self.read_csv(StringIO(data), header=None) tm.assert_frame_equal(result, expected) data = str(2**63) + '\n-1' expected = DataFrame([str(2**63), str(-1)]) result = self.read_csv(StringIO(data), header=None) tm.assert_frame_equal(result, expected) def test_empty_with_nrows_chunksize(self): # see gh-9535 expected = DataFrame([], columns=['foo', 'bar']) result = self.read_csv(StringIO('foo,bar\n'), nrows=10) tm.assert_frame_equal(result, expected) result = next(iter(self.read_csv( StringIO('foo,bar\n'), chunksize=10))) tm.assert_frame_equal(result, expected) def test_eof_states(self): # see gh-10728, gh-10548 # With skip_blank_lines = True expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) # gh-10728: WHITESPACE_LINE data = 'a,b,c\n4,5,6\n ' result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) # gh-10548: EAT_LINE_COMMENT data = 'a,b,c\n4,5,6\n#comment' result = self.read_csv(StringIO(data), comment='#') tm.assert_frame_equal(result, expected) # EAT_CRNL_NOP data = 'a,b,c\n4,5,6\n\r' result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) # EAT_COMMENT data = 'a,b,c\n4,5,6#comment' result = self.read_csv(StringIO(data), comment='#') tm.assert_frame_equal(result, expected) # SKIP_LINE data = 'a,b,c\n4,5,6\nskipme' result = self.read_csv(StringIO(data), skiprows=[2]) tm.assert_frame_equal(result, expected) # With skip_blank_lines = False # EAT_LINE_COMMENT data = 'a,b,c\n4,5,6\n#comment' result = self.read_csv( StringIO(data), comment='#', skip_blank_lines=False) expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) # IN_FIELD data = 'a,b,c\n4,5,6\n ' result = self.read_csv(StringIO(data), skip_blank_lines=False) expected = DataFrame( [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) # EAT_CRNL data = 'a,b,c\n4,5,6\n\r' result = self.read_csv(StringIO(data), skip_blank_lines=False) expected = DataFrame( [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) # Should produce exceptions # ESCAPED_CHAR data = "a,b,c\n4,5,6\n\\" pytest.raises(Exception, self.read_csv, StringIO(data), escapechar='\\') # ESCAPE_IN_QUOTED_FIELD data = 'a,b,c\n4,5,6\n"\\' pytest.raises(Exception, self.read_csv, StringIO(data), escapechar='\\') # IN_QUOTED_FIELD data = 'a,b,c\n4,5,6\n"' pytest.raises(Exception, self.read_csv, StringIO(data), escapechar='\\') def test_uneven_lines_with_usecols(self): # See gh-12203 csv = r"""a,b,c 0,1,2 3,4,5,6,7 8,9,10 """ # make sure that an error is still thrown # when the 'usecols' parameter is not provided msg = r"Expected \d+ fields in line \d+, saw \d+" with tm.assert_raises_regex(ValueError, msg): df = self.read_csv(StringIO(csv)) expected = DataFrame({ 'a': [0, 3, 8], 'b': [1, 4, 9] }) usecols = [0, 1] df = self.read_csv(StringIO(csv), usecols=usecols) tm.assert_frame_equal(df, expected) usecols = ['a', 'b'] df = self.read_csv(StringIO(csv), usecols=usecols) tm.assert_frame_equal(df, expected) def test_read_empty_with_usecols(self): # See gh-12493 names = ['Dummy', 'X', 'Dummy_2'] usecols = names[1:2] # ['X'] # first, check to see that the response of # parser when faced with no provided columns # throws the correct error, with or without usecols errmsg = "No columns to parse from file" with tm.assert_raises_regex(EmptyDataError, errmsg): self.read_csv(StringIO('')) with tm.assert_raises_regex(EmptyDataError, errmsg): self.read_csv(StringIO(''), usecols=usecols) expected = DataFrame(columns=usecols, index=[0], dtype=np.float64) df = self.read_csv(StringIO(',,'), names=names, usecols=usecols) tm.assert_frame_equal(df, expected) expected = DataFrame(columns=usecols) df = self.read_csv(StringIO(''), names=names, usecols=usecols) tm.assert_frame_equal(df, expected) def test_trailing_spaces(self): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa expected = DataFrame([[1., 2., 4.], [5.1, np.nan, 10.]]) # gh-8661, gh-8679: this should ignore six lines including # lines with trailing whitespace and blank lines df = self.read_csv(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) # gh-8983: test skipping set of rows after a row with trailing spaces expected = DataFrame({"A": [1., 5.1], "B": [2., np.nan], "C": [4., 10]}) df = self.read_table(StringIO(data.replace(',', ' ')), delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) def test_raise_on_sep_with_delim_whitespace(self): # see gh-6607 data = 'a b c\n1 2 3' with tm.assert_raises_regex(ValueError, 'you can only specify one'): self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True) def test_single_char_leading_whitespace(self): # see gh-9710 data = """\ MyColumn a b a b\n""" expected = DataFrame({'MyColumn': list('abab')}) result = self.read_csv(StringIO(data), delim_whitespace=True, skipinitialspace=True) tm.assert_frame_equal(result, expected) result = self.read_csv(StringIO(data), skipinitialspace=True) tm.assert_frame_equal(result, expected) def test_empty_lines(self): data = """\ A,B,C 1,2.,4. 5.,NaN,10.0 -70,.4,1 """ expected = np.array([[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]) df = self.read_csv(StringIO(data)) tm.assert_numpy_array_equal(df.values, expected) df = self.read_csv(StringIO(data.replace(',', ' ')), sep=r'\s+') tm.assert_numpy_array_equal(df.values, expected) expected = np.array([[1., 2., 4.], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], [5., np.nan, 10.], [np.nan, np.nan, np.nan], [-70., .4, 1.]]) df = self.read_csv(StringIO(data), skip_blank_lines=False) tm.assert_numpy_array_equal(df.values, expected) def test_whitespace_lines(self): data = """ \t \t\t \t A,B,C \t 1,2.,4. 5.,NaN,10.0 """ expected = np.array([[1, 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data)) tm.assert_numpy_array_equal(df.values, expected) def test_regex_separator(self): # see gh-6607 data = """ A B C D a 1 2 3 4 b 1 2 3 4 c 1 2 3 4 """ df = self.read_table(StringIO(data), sep=r'\s+') expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), index_col=0) assert expected.index.name is None tm.assert_frame_equal(df, expected) data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' result = self.read_table(StringIO(data), sep=r'\s+') expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) @tm.capture_stdout def test_verbose_import(self): text = """a,b,c,d one,1,2,3 one,1,2,3 ,1,2,3 one,1,2,3 ,1,2,3 ,1,2,3 one,1,2,3 two,1,2,3""" # Engines are verbose in different ways. self.read_csv(StringIO(text), verbose=True) output = sys.stdout.getvalue() if self.engine == 'c': assert 'Tokenization took:' in output assert 'Parser memory cleanup took:' in output else: # Python engine assert output == 'Filled 3 NA values in column a\n' # Reset the stdout buffer. sys.stdout = StringIO() text = """a,b,c,d one,1,2,3 two,1,2,3 three,1,2,3 four,1,2,3 five,1,2,3 ,1,2,3 seven,1,2,3 eight,1,2,3""" self.read_csv(StringIO(text), verbose=True, index_col=0) output = sys.stdout.getvalue() # Engines are verbose in different ways. if self.engine == 'c': assert 'Tokenization took:' in output assert 'Parser memory cleanup took:' in output else: # Python engine assert output == 'Filled 1 NA values in column a\n' @pytest.mark.skipif(PY3, reason="won't work in Python 3") def test_iteration_open_handle(self): with tm.ensure_clean() as path: with open(path, 'wb') as f: f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG') with open(path, 'rb') as f: for line in f: if 'CCC' in line: break if self.engine == 'c': pytest.raises(Exception, self.read_table, f, squeeze=True, header=None) else: result = self.read_table(f, squeeze=True, header=None) expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) tm.assert_series_equal(result, expected) def test_1000_sep_with_decimal(self): data = """A|B|C 1|2,334.01|5 10|13|10. """ expected = DataFrame({ 'A': [1, 10], 'B': [2334.01, 13], 'C': [5, 10.] }) assert expected.A.dtype == 'int64' assert expected.B.dtype == 'float' assert expected.C.dtype == 'float' df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.') tm.assert_frame_equal(df, expected) data_with_odd_sep = """A|B|C 1|2.334,01|5 10|13|10, """ df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') tm.assert_frame_equal(df, expected) def test_euro_decimal_format(self): data = """Id;Number1;Number2;Text1;Text2;Number3 1;1521,1541;187101,9543;ABC;poi;4,738797819 2;121,12;14897,76;DEF;uyt;0,377320872 3;878,158;108013,434;GHI;rez;2,735694704""" df2 = self.read_csv(StringIO(data), sep=';', decimal=',') assert df2['Number1'].dtype == float assert df2['Number2'].dtype == float assert df2['Number3'].dtype == float def test_inf_parsing(self): data = """\ ,A a,inf b,-inf c,+Inf d,-Inf e,INF f,-INF g,+INf h,-INf i,inF j,-inF""" inf = float('inf') expected = Series([inf, -inf] * 5) df = self.read_csv(StringIO(data), index_col=0) tm.assert_almost_equal(df['A'].values, expected.values) df = self.read_csv(StringIO(data), index_col=0, na_filter=False) tm.assert_almost_equal(df['A'].values, expected.values) def test_raise_on_no_columns(self): # single newline data = "\n" pytest.raises(EmptyDataError, self.read_csv, StringIO(data)) # test with more than a single newline data = "\n\n\n" pytest.raises(EmptyDataError, self.read_csv, StringIO(data)) def test_memory_map(self): mmap_file = os.path.join(self.dirpath, 'test_mmap.csv') expected = DataFrame({ 'a': [1, 2, 3], 'b': ['one', 'two', 'three'], 'c': ['I', 'II', 'III'] }) out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) def test_null_byte_char(self): # see gh-2741 data = '\x00,foo' cols = ['a', 'b'] expected = DataFrame([[np.nan, 'foo']], columns=cols) if self.engine == 'c': out = self.read_csv(StringIO(data), names=cols) tm.assert_frame_equal(out, expected) else: msg = "NULL byte detected" with tm.assert_raises_regex(ParserError, msg): self.read_csv(StringIO(data), names=cols) def test_utf8_bom(self): # see gh-4793 bom = u('\ufeff') utf8 = 'utf-8' def _encode_data_with_bom(_data): bom_data = (bom + _data).encode(utf8) return BytesIO(bom_data) # basic test data = 'a\n1' expected = DataFrame({'a': [1]}) out = self.read_csv(_encode_data_with_bom(data), encoding=utf8) tm.assert_frame_equal(out, expected) # test with "regular" quoting data = '"a"\n1' expected = DataFrame({'a': [1]}) out = self.read_csv(_encode_data_with_bom(data), encoding=utf8, quotechar='"') tm.assert_frame_equal(out, expected) # test in a data row instead of header data = 'b\n1' expected = DataFrame({'a': ['b', '1']}) out = self.read_csv(_encode_data_with_bom(data), encoding=utf8, names=['a']) tm.assert_frame_equal(out, expected) # test in empty data row with skipping data = '\n1' expected = DataFrame({'a': [1]}) out = self.read_csv(_encode_data_with_bom(data), encoding=utf8, names=['a'], skip_blank_lines=True) tm.assert_frame_equal(out, expected) # test in empty data row without skipping data = '\n1' expected = DataFrame({'a': [np.nan, 1.0]}) out = self.read_csv(_encode_data_with_bom(data), encoding=utf8, names=['a'], skip_blank_lines=False) tm.assert_frame_equal(out, expected) def test_temporary_file(self): # see gh-13398 data1 = "0 0" from tempfile import TemporaryFile new_file = TemporaryFile("w+") new_file.write(data1) new_file.flush() new_file.seek(0) result = self.read_csv(new_file, sep=r'\s+', header=None) new_file.close() expected = DataFrame([[0, 0]]) tm.assert_frame_equal(result, expected) def test_read_csv_utf_aliases(self): # see gh issue 13549 expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']}) for byte in [8, 16]: for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']: encoding = fmt.format(byte) data = 'mb_num,multibyte\n4.8,test'.encode(encoding) result = self.read_csv(BytesIO(data), encoding=encoding) tm.assert_frame_equal(result, expected) def test_internal_eof_byte(self): # see gh-5500 data = "a,b\n1\x1a,2" expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b']) result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) def test_internal_eof_byte_to_file(self): # see gh-16559 data = b'c1,c2\r\n"test \x1a test", test\r\n' expected = pd.DataFrame([["test \x1a test", " test"]], columns=["c1", "c2"]) path = '__%s__.csv' % tm.rands(10) with tm.ensure_clean(path) as path: with open(path, "wb") as f: f.write(data) result = self.read_csv(path) tm.assert_frame_equal(result, expected) def test_sub_character(self, datapath): # see gh-16893 filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) tm.assert_frame_equal(result, expected) def test_file_handles(self): # GH 14418 - don't close user provided file handles fh = StringIO('a,b\n1,2') self.read_csv(fh) assert not fh.closed with open(self.csv1, 'r') as f: self.read_csv(f) assert not f.closed # mmap not working with python engine if self.engine != 'python': import mmap with open(self.csv1, 'r') as f: m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) self.read_csv(m) # closed attribute new in python 3.2 if PY3: assert not m.closed m.close() def test_invalid_file_buffer(self): # see gh-15337 class InvalidBuffer(object): pass msg = "Invalid file path or buffer object type" with tm.assert_raises_regex(ValueError, msg): self.read_csv(InvalidBuffer()) # gh-16135: we want to ensure that "tell" and "seek" # aren't actually being used when we call `read_csv` # # Thus, while the object may look "invalid" (these # methods are attributes of the `StringIO` class), # it is still a valid file-object for our purposes. class NoSeekTellBuffer(StringIO): def tell(self): raise AttributeError("No tell method") def seek(self, pos, whence=0): raise AttributeError("No seek method") data = "a\n1" expected = pd.DataFrame({"a": [1]}) result = self.read_csv(NoSeekTellBuffer(data)) tm.assert_frame_equal(result, expected) if PY3: from unittest import mock with tm.assert_raises_regex(ValueError, msg): self.read_csv(mock.Mock()) @tm.capture_stderr def test_skip_bad_lines(self): # see gh-15925 data = 'a\n1\n1,2,3\n4\n5,6,7' with pytest.raises(ParserError): self.read_csv(StringIO(data)) with pytest.raises(ParserError): self.read_csv(StringIO(data), error_bad_lines=True) expected = DataFrame({'a': [1, 4]}) out = self.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=False) tm.assert_frame_equal(out, expected) val = sys.stderr.getvalue() assert val == '' # Reset the stderr buffer. sys.stderr = StringIO() out = self.read_csv(StringIO(data), error_bad_lines=False, warn_bad_lines=True) tm.assert_frame_equal(out, expected) val = sys.stderr.getvalue() assert 'Skipping line 3' in val assert 'Skipping line 5' in val