1614 lines
53 KiB
Python
1614 lines
53 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import csv
|
|
import os
|
|
import platform
|
|
import codecs
|
|
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
from collections import OrderedDict
|
|
|
|
import pytest
|
|
import numpy as np
|
|
from pandas._libs.tslib import Timestamp
|
|
|
|
import pandas as pd
|
|
import pandas.util.testing as tm
|
|
from pandas import DataFrame, Series, Index, MultiIndex
|
|
from pandas import compat
|
|
from pandas.compat import (StringIO, BytesIO, PY3,
|
|
range, lrange, u)
|
|
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
|
|
from pandas.io.common import URLError
|
|
from pandas.io.parsers import TextFileReader, TextParser
|
|
|
|
|
|
class ParserTests(object):
|
|
"""
|
|
Want to be able to test either C+Cython or Python+Cython parsers
|
|
"""
|
|
data1 = """index,A,B,C,D
|
|
foo,2,3,4,5
|
|
bar,7,8,9,10
|
|
baz,12,13,14,15
|
|
qux,12,13,14,15
|
|
foo2,12,13,14,15
|
|
bar2,12,13,14,15
|
|
"""
|
|
|
|
def test_empty_decimal_marker(self):
|
|
data = """A|B|C
|
|
1|2,334|5
|
|
10|13|10.
|
|
"""
|
|
# Parsers support only length-1 decimals
|
|
msg = 'Only length-1 decimal markers supported'
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(data), decimal='')
|
|
|
|
def test_bad_stream_exception(self):
|
|
# Issue 13652:
|
|
# This test validates that both python engine
|
|
# and C engine will raise UnicodeDecodeError instead of
|
|
# c engine raising ParserError and swallowing exception
|
|
# that caused read to fail.
|
|
codec = codecs.lookup("utf-8")
|
|
utf8 = codecs.lookup('utf-8')
|
|
|
|
if compat.PY3:
|
|
msg = "'utf-8' codec can't decode byte"
|
|
else:
|
|
msg = "'utf8' codec can't decode byte"
|
|
|
|
# stream must be binary UTF8
|
|
with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder(
|
|
handle, utf8.encode, utf8.decode, codec.streamreader,
|
|
codec.streamwriter) as stream:
|
|
|
|
with tm.assert_raises_regex(UnicodeDecodeError, msg):
|
|
self.read_csv(stream)
|
|
|
|
def test_read_csv(self):
|
|
if not compat.PY3:
|
|
if compat.is_platform_windows():
|
|
prefix = u("file:///")
|
|
else:
|
|
prefix = u("file://")
|
|
|
|
fname = prefix + compat.text_type(os.path.abspath(self.csv1))
|
|
self.read_csv(fname, index_col=0, parse_dates=True)
|
|
|
|
def test_1000_sep(self):
|
|
data = """A|B|C
|
|
1|2,334|5
|
|
10|13|10.
|
|
"""
|
|
expected = DataFrame({
|
|
'A': [1, 10],
|
|
'B': [2334, 13],
|
|
'C': [5, 10.]
|
|
})
|
|
|
|
df = self.read_csv(StringIO(data), sep='|', thousands=',')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = self.read_table(StringIO(data), sep='|', thousands=',')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_squeeze(self):
|
|
data = """\
|
|
a,1
|
|
b,2
|
|
c,3
|
|
"""
|
|
idx = Index(['a', 'b', 'c'], name=0)
|
|
expected = Series([1, 2, 3], name=1, index=idx)
|
|
result = self.read_table(StringIO(data), sep=',', index_col=0,
|
|
header=None, squeeze=True)
|
|
assert isinstance(result, Series)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_squeeze_no_view(self):
|
|
# see gh-8217
|
|
# Series should not be a view
|
|
data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13"""
|
|
result = self.read_csv(StringIO(data), index_col='time', squeeze=True)
|
|
assert not result._is_view
|
|
|
|
def test_malformed(self):
|
|
# see gh-6607
|
|
|
|
# all
|
|
data = """ignore
|
|
A,B,C
|
|
1,2,3 # comment
|
|
1,2,3,4,5
|
|
2,3,4
|
|
"""
|
|
msg = 'Expected 3 fields in line 4, saw 5'
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
self.read_table(StringIO(data), sep=',',
|
|
header=1, comment='#')
|
|
|
|
# first chunk
|
|
data = """ignore
|
|
A,B,C
|
|
skip
|
|
1,2,3
|
|
3,5,10 # comment
|
|
1,2,3,4,5
|
|
2,3,4
|
|
"""
|
|
msg = 'Expected 3 fields in line 6, saw 5'
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
it = self.read_table(StringIO(data), sep=',',
|
|
header=1, comment='#',
|
|
iterator=True, chunksize=1,
|
|
skiprows=[2])
|
|
it.read(5)
|
|
|
|
# middle chunk
|
|
data = """ignore
|
|
A,B,C
|
|
skip
|
|
1,2,3
|
|
3,5,10 # comment
|
|
1,2,3,4,5
|
|
2,3,4
|
|
"""
|
|
msg = 'Expected 3 fields in line 6, saw 5'
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
it = self.read_table(StringIO(data), sep=',', header=1,
|
|
comment='#', iterator=True, chunksize=1,
|
|
skiprows=[2])
|
|
it.read(3)
|
|
|
|
# last chunk
|
|
data = """ignore
|
|
A,B,C
|
|
skip
|
|
1,2,3
|
|
3,5,10 # comment
|
|
1,2,3,4,5
|
|
2,3,4
|
|
"""
|
|
msg = 'Expected 3 fields in line 6, saw 5'
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
it = self.read_table(StringIO(data), sep=',', header=1,
|
|
comment='#', iterator=True, chunksize=1,
|
|
skiprows=[2])
|
|
it.read()
|
|
|
|
# skipfooter is not supported with the C parser yet
|
|
if self.engine == 'python':
|
|
# skipfooter
|
|
data = """ignore
|
|
A,B,C
|
|
1,2,3 # comment
|
|
1,2,3,4,5
|
|
2,3,4
|
|
footer
|
|
"""
|
|
msg = 'Expected 3 fields in line 4, saw 5'
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
self.read_table(StringIO(data), sep=',',
|
|
header=1, comment='#',
|
|
skipfooter=1)
|
|
|
|
def test_quoting(self):
|
|
bad_line_small = """printer\tresult\tvariant_name
|
|
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
|
|
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
|
|
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""
|
|
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
|
|
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>""" # noqa
|
|
pytest.raises(Exception, self.read_table, StringIO(bad_line_small),
|
|
sep='\t')
|
|
|
|
good_line_small = bad_line_small + '"'
|
|
df = self.read_table(StringIO(good_line_small), sep='\t')
|
|
assert len(df) == 3
|
|
|
|
def test_unnamed_columns(self):
|
|
data = """A,B,C,,
|
|
1,2,3,4,5
|
|
6,7,8,9,10
|
|
11,12,13,14,15
|
|
"""
|
|
expected = np.array([[1, 2, 3, 4, 5],
|
|
[6, 7, 8, 9, 10],
|
|
[11, 12, 13, 14, 15]], dtype=np.int64)
|
|
df = self.read_table(StringIO(data), sep=',')
|
|
tm.assert_almost_equal(df.values, expected)
|
|
tm.assert_index_equal(df.columns,
|
|
Index(['A', 'B', 'C', 'Unnamed: 3',
|
|
'Unnamed: 4']))
|
|
|
|
def test_csv_mixed_type(self):
|
|
data = """A,B,C
|
|
a,1,2
|
|
b,3,4
|
|
c,4,5
|
|
"""
|
|
expected = DataFrame({'A': ['a', 'b', 'c'],
|
|
'B': [1, 3, 4],
|
|
'C': [2, 4, 5]})
|
|
out = self.read_csv(StringIO(data))
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
def test_read_csv_low_memory_no_rows_with_index(self):
|
|
if self.engine == "c" and not self.low_memory:
|
|
pytest.skip("This is a low-memory specific test")
|
|
|
|
# see gh-21141
|
|
data = """A,B,C
|
|
1,1,1,2
|
|
2,2,3,4
|
|
3,3,4,5
|
|
"""
|
|
out = self.read_csv(StringIO(data), low_memory=True,
|
|
index_col=0, nrows=0)
|
|
expected = DataFrame(columns=["A", "B", "C"])
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
def test_read_csv_dataframe(self):
|
|
df = self.read_csv(self.csv1, index_col=0, parse_dates=True)
|
|
df2 = self.read_table(self.csv1, sep=',', index_col=0,
|
|
parse_dates=True)
|
|
tm.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D']))
|
|
assert df.index.name == 'index'
|
|
assert isinstance(
|
|
df.index[0], (datetime, np.datetime64, Timestamp))
|
|
assert df.values.dtype == np.float64
|
|
tm.assert_frame_equal(df, df2)
|
|
|
|
def test_read_csv_no_index_name(self):
|
|
df = self.read_csv(self.csv2, index_col=0, parse_dates=True)
|
|
df2 = self.read_table(self.csv2, sep=',', index_col=0,
|
|
parse_dates=True)
|
|
tm.assert_index_equal(df.columns,
|
|
pd.Index(['A', 'B', 'C', 'D', 'E']))
|
|
assert isinstance(df.index[0], (datetime, np.datetime64, Timestamp))
|
|
assert df.loc[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64
|
|
tm.assert_frame_equal(df, df2)
|
|
|
|
def test_read_table_unicode(self):
|
|
fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
|
|
df1 = self.read_table(fin, sep=";", encoding="utf-8", header=None)
|
|
assert isinstance(df1[0].values[0], compat.text_type)
|
|
|
|
def test_read_table_wrong_num_columns(self):
|
|
# too few!
|
|
data = """A,B,C,D,E,F
|
|
1,2,3,4,5,6
|
|
6,7,8,9,10,11,12
|
|
11,12,13,14,15,16
|
|
"""
|
|
pytest.raises(ValueError, self.read_csv, StringIO(data))
|
|
|
|
def test_read_duplicate_index_explicit(self):
|
|
data = """index,A,B,C,D
|
|
foo,2,3,4,5
|
|
bar,7,8,9,10
|
|
baz,12,13,14,15
|
|
qux,12,13,14,15
|
|
foo,12,13,14,15
|
|
bar,12,13,14,15
|
|
"""
|
|
|
|
result = self.read_csv(StringIO(data), index_col=0)
|
|
expected = self.read_csv(StringIO(data)).set_index(
|
|
'index', verify_integrity=False)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = self.read_table(StringIO(data), sep=',', index_col=0)
|
|
expected = self.read_table(StringIO(data), sep=',', ).set_index(
|
|
'index', verify_integrity=False)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_read_duplicate_index_implicit(self):
|
|
data = """A,B,C,D
|
|
foo,2,3,4,5
|
|
bar,7,8,9,10
|
|
baz,12,13,14,15
|
|
qux,12,13,14,15
|
|
foo,12,13,14,15
|
|
bar,12,13,14,15
|
|
"""
|
|
|
|
# make sure an error isn't thrown
|
|
self.read_csv(StringIO(data))
|
|
self.read_table(StringIO(data), sep=',')
|
|
|
|
def test_parse_bools(self):
|
|
data = """A,B
|
|
True,1
|
|
False,2
|
|
True,3
|
|
"""
|
|
data = self.read_csv(StringIO(data))
|
|
assert data['A'].dtype == np.bool_
|
|
|
|
data = """A,B
|
|
YES,1
|
|
no,2
|
|
yes,3
|
|
No,3
|
|
Yes,3
|
|
"""
|
|
data = self.read_csv(StringIO(data),
|
|
true_values=['yes', 'Yes', 'YES'],
|
|
false_values=['no', 'NO', 'No'])
|
|
assert data['A'].dtype == np.bool_
|
|
|
|
data = """A,B
|
|
TRUE,1
|
|
FALSE,2
|
|
TRUE,3
|
|
"""
|
|
data = self.read_csv(StringIO(data))
|
|
assert data['A'].dtype == np.bool_
|
|
|
|
data = """A,B
|
|
foo,bar
|
|
bar,foo"""
|
|
result = self.read_csv(StringIO(data), true_values=['foo'],
|
|
false_values=['bar'])
|
|
expected = DataFrame({'A': [True, False], 'B': [False, True]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_int_conversion(self):
|
|
data = """A,B
|
|
1.0,1
|
|
2.0,2
|
|
3.0,3
|
|
"""
|
|
data = self.read_csv(StringIO(data))
|
|
assert data['A'].dtype == np.float64
|
|
assert data['B'].dtype == np.int64
|
|
|
|
def test_read_nrows(self):
|
|
expected = self.read_csv(StringIO(self.data1))[:3]
|
|
|
|
df = self.read_csv(StringIO(self.data1), nrows=3)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# see gh-10476
|
|
df = self.read_csv(StringIO(self.data1), nrows=3.0)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
msg = r"'nrows' must be an integer >=0"
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), nrows=1.2)
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), nrows='foo')
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), nrows=-1)
|
|
|
|
def test_read_chunksize(self):
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0)
|
|
|
|
chunks = list(reader)
|
|
|
|
tm.assert_frame_equal(chunks[0], df[:2])
|
|
tm.assert_frame_equal(chunks[1], df[2:4])
|
|
tm.assert_frame_equal(chunks[2], df[4:])
|
|
|
|
# with invalid chunksize value:
|
|
msg = r"'chunksize' must be an integer >=1"
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), chunksize=1.3)
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), chunksize='foo')
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(StringIO(self.data1), chunksize=0)
|
|
|
|
def test_read_chunksize_and_nrows(self):
|
|
|
|
# gh-15755
|
|
# With nrows
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0,
|
|
chunksize=2, nrows=5)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
|
|
|
|
tm.assert_frame_equal(pd.concat(reader), df)
|
|
|
|
# chunksize > nrows
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0,
|
|
chunksize=8, nrows=5)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
|
|
|
|
tm.assert_frame_equal(pd.concat(reader), df)
|
|
|
|
# with changing "size":
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0,
|
|
chunksize=8, nrows=5)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
|
|
|
|
tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2])
|
|
tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5])
|
|
with pytest.raises(StopIteration):
|
|
reader.get_chunk(size=3)
|
|
|
|
def test_read_chunksize_named(self):
|
|
reader = self.read_csv(
|
|
StringIO(self.data1), index_col='index', chunksize=2)
|
|
df = self.read_csv(StringIO(self.data1), index_col='index')
|
|
|
|
chunks = list(reader)
|
|
|
|
tm.assert_frame_equal(chunks[0], df[:2])
|
|
tm.assert_frame_equal(chunks[1], df[2:4])
|
|
tm.assert_frame_equal(chunks[2], df[4:])
|
|
|
|
def test_get_chunk_passed_chunksize(self):
|
|
data = """A,B,C
|
|
1,2,3
|
|
4,5,6
|
|
7,8,9
|
|
1,2,3"""
|
|
result = self.read_csv(StringIO(data), chunksize=2)
|
|
|
|
piece = result.get_chunk()
|
|
assert len(piece) == 2
|
|
|
|
def test_read_chunksize_generated_index(self):
|
|
# GH 12185
|
|
reader = self.read_csv(StringIO(self.data1), chunksize=2)
|
|
df = self.read_csv(StringIO(self.data1))
|
|
|
|
tm.assert_frame_equal(pd.concat(reader), df)
|
|
|
|
reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0)
|
|
|
|
tm.assert_frame_equal(pd.concat(reader), df)
|
|
|
|
def test_read_text_list(self):
|
|
data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
|
|
as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar',
|
|
'4', '5', '6']]
|
|
df = self.read_csv(StringIO(data), index_col=0)
|
|
|
|
parser = TextParser(as_list, index_col=0, chunksize=2)
|
|
chunk = parser.read(None)
|
|
|
|
tm.assert_frame_equal(chunk, df)
|
|
|
|
def test_iterator(self):
|
|
# See gh-6607
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0,
|
|
iterator=True)
|
|
df = self.read_csv(StringIO(self.data1), index_col=0)
|
|
|
|
chunk = reader.read(3)
|
|
tm.assert_frame_equal(chunk, df[:3])
|
|
|
|
last_chunk = reader.read(5)
|
|
tm.assert_frame_equal(last_chunk, df[3:])
|
|
|
|
# pass list
|
|
lines = list(csv.reader(StringIO(self.data1)))
|
|
parser = TextParser(lines, index_col=0, chunksize=2)
|
|
|
|
df = self.read_csv(StringIO(self.data1), index_col=0)
|
|
|
|
chunks = list(parser)
|
|
tm.assert_frame_equal(chunks[0], df[:2])
|
|
tm.assert_frame_equal(chunks[1], df[2:4])
|
|
tm.assert_frame_equal(chunks[2], df[4:])
|
|
|
|
# pass skiprows
|
|
parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
|
|
chunks = list(parser)
|
|
tm.assert_frame_equal(chunks[0], df[1:3])
|
|
|
|
treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
|
|
iterator=True)
|
|
assert isinstance(treader, TextFileReader)
|
|
|
|
# gh-3967: stopping iteration when chunksize is specified
|
|
data = """A,B,C
|
|
foo,1,2,3
|
|
bar,4,5,6
|
|
baz,7,8,9
|
|
"""
|
|
reader = self.read_csv(StringIO(data), iterator=True)
|
|
result = list(reader)
|
|
expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
|
|
3, 6, 9]), index=['foo', 'bar', 'baz'])
|
|
tm.assert_frame_equal(result[0], expected)
|
|
|
|
# chunksize = 1
|
|
reader = self.read_csv(StringIO(data), chunksize=1)
|
|
result = list(reader)
|
|
expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
|
|
3, 6, 9]), index=['foo', 'bar', 'baz'])
|
|
assert len(result) == 3
|
|
tm.assert_frame_equal(pd.concat(result), expected)
|
|
|
|
# skipfooter is not supported with the C parser yet
|
|
if self.engine == 'python':
|
|
# test bad parameter (skipfooter)
|
|
reader = self.read_csv(StringIO(self.data1), index_col=0,
|
|
iterator=True, skipfooter=1)
|
|
pytest.raises(ValueError, reader.read, 3)
|
|
|
|
def test_pass_names_with_index(self):
|
|
lines = self.data1.split('\n')
|
|
no_header = '\n'.join(lines[1:])
|
|
|
|
# regular index
|
|
names = ['index', 'A', 'B', 'C', 'D']
|
|
df = self.read_csv(StringIO(no_header), index_col=0, names=names)
|
|
expected = self.read_csv(StringIO(self.data1), index_col=0)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# multi index
|
|
data = """index1,index2,A,B,C,D
|
|
foo,one,2,3,4,5
|
|
foo,two,7,8,9,10
|
|
foo,three,12,13,14,15
|
|
bar,one,12,13,14,15
|
|
bar,two,12,13,14,15
|
|
"""
|
|
lines = data.split('\n')
|
|
no_header = '\n'.join(lines[1:])
|
|
names = ['index1', 'index2', 'A', 'B', 'C', 'D']
|
|
df = self.read_csv(StringIO(no_header), index_col=[0, 1],
|
|
names=names)
|
|
expected = self.read_csv(StringIO(data), index_col=[0, 1])
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = self.read_csv(StringIO(data), index_col=['index1', 'index2'])
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_multi_index_no_level_names(self):
|
|
data = """index1,index2,A,B,C,D
|
|
foo,one,2,3,4,5
|
|
foo,two,7,8,9,10
|
|
foo,three,12,13,14,15
|
|
bar,one,12,13,14,15
|
|
bar,two,12,13,14,15
|
|
"""
|
|
|
|
data2 = """A,B,C,D
|
|
foo,one,2,3,4,5
|
|
foo,two,7,8,9,10
|
|
foo,three,12,13,14,15
|
|
bar,one,12,13,14,15
|
|
bar,two,12,13,14,15
|
|
"""
|
|
|
|
lines = data.split('\n')
|
|
no_header = '\n'.join(lines[1:])
|
|
names = ['A', 'B', 'C', 'D']
|
|
|
|
df = self.read_csv(StringIO(no_header), index_col=[0, 1],
|
|
header=None, names=names)
|
|
expected = self.read_csv(StringIO(data), index_col=[0, 1])
|
|
tm.assert_frame_equal(df, expected, check_names=False)
|
|
|
|
# 2 implicit first cols
|
|
df2 = self.read_csv(StringIO(data2))
|
|
tm.assert_frame_equal(df2, df)
|
|
|
|
# reverse order of index
|
|
df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names,
|
|
header=None)
|
|
expected = self.read_csv(StringIO(data), index_col=[1, 0])
|
|
tm.assert_frame_equal(df, expected, check_names=False)
|
|
|
|
def test_multi_index_blank_df(self):
|
|
# GH 14545
|
|
data = """a,b
|
|
"""
|
|
df = self.read_csv(StringIO(data), header=[0])
|
|
expected = DataFrame(columns=['a', 'b'])
|
|
tm.assert_frame_equal(df, expected)
|
|
round_trip = self.read_csv(StringIO(
|
|
expected.to_csv(index=False)), header=[0])
|
|
tm.assert_frame_equal(round_trip, expected)
|
|
|
|
data_multiline = """a,b
|
|
c,d
|
|
"""
|
|
df2 = self.read_csv(StringIO(data_multiline), header=[0, 1])
|
|
cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')])
|
|
expected2 = DataFrame(columns=cols)
|
|
tm.assert_frame_equal(df2, expected2)
|
|
round_trip = self.read_csv(StringIO(
|
|
expected2.to_csv(index=False)), header=[0, 1])
|
|
tm.assert_frame_equal(round_trip, expected2)
|
|
|
|
def test_no_unnamed_index(self):
|
|
data = """ id c0 c1 c2
|
|
0 1 0 a b
|
|
1 2 0 c d
|
|
2 2 2 e f
|
|
"""
|
|
df = self.read_table(StringIO(data), sep=' ')
|
|
assert df.index.name is None
|
|
|
|
def test_read_csv_parse_simple_list(self):
|
|
text = """foo
|
|
bar baz
|
|
qux foo
|
|
foo
|
|
bar"""
|
|
df = self.read_csv(StringIO(text), header=None)
|
|
expected = DataFrame({0: ['foo', 'bar baz', 'qux foo',
|
|
'foo', 'bar']})
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
@tm.network
|
|
def test_url(self, datapath):
|
|
# HTTP(S)
|
|
url = ('https://raw.github.com/pandas-dev/pandas/master/'
|
|
'pandas/tests/io/parser/data/salaries.csv')
|
|
url_table = self.read_table(url)
|
|
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
|
|
local_table = self.read_table(localtable)
|
|
tm.assert_frame_equal(url_table, local_table)
|
|
# TODO: ftp testing
|
|
|
|
@pytest.mark.slow
|
|
def test_file(self, datapath):
|
|
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
|
|
local_table = self.read_table(localtable)
|
|
|
|
try:
|
|
url_table = self.read_table('file://localhost/' + localtable)
|
|
except URLError:
|
|
# fails on some systems
|
|
pytest.skip("failing on %s" %
|
|
' '.join(platform.uname()).strip())
|
|
|
|
tm.assert_frame_equal(url_table, local_table)
|
|
|
|
def test_path_pathlib(self):
|
|
df = tm.makeDataFrame()
|
|
result = tm.round_trip_pathlib(df.to_csv,
|
|
lambda p: self.read_csv(p, index_col=0))
|
|
tm.assert_frame_equal(df, result)
|
|
|
|
def test_path_localpath(self):
|
|
df = tm.makeDataFrame()
|
|
result = tm.round_trip_localpath(
|
|
df.to_csv,
|
|
lambda p: self.read_csv(p, index_col=0))
|
|
tm.assert_frame_equal(df, result)
|
|
|
|
def test_nonexistent_path(self):
|
|
# gh-2428: pls no segfault
|
|
# gh-14086: raise more helpful FileNotFoundError
|
|
path = '%s.csv' % tm.rands(10)
|
|
pytest.raises(compat.FileNotFoundError, self.read_csv, path)
|
|
|
|
def test_missing_trailing_delimiters(self):
|
|
data = """A,B,C,D
|
|
1,2,3,4
|
|
1,3,3,
|
|
1,4,5"""
|
|
result = self.read_csv(StringIO(data))
|
|
assert result['D'].isna()[1:].all()
|
|
|
|
def test_skipinitialspace(self):
|
|
s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
|
|
'1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, '
|
|
'314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, '
|
|
'70.06056, 344.98370, 1, 1, -0.689265, -0.692787, '
|
|
'0.212036, 14.7674, 41.605, -9999.0, -9999.0, '
|
|
'-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128')
|
|
|
|
sfile = StringIO(s)
|
|
# it's 33 columns
|
|
result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'],
|
|
header=None, skipinitialspace=True)
|
|
assert pd.isna(result.iloc[0, 29])
|
|
|
|
def test_utf16_bom_skiprows(self):
|
|
# #2298
|
|
data = u("""skip this
|
|
skip this too
|
|
A\tB\tC
|
|
1\t2\t3
|
|
4\t5\t6""")
|
|
|
|
data2 = u("""skip this
|
|
skip this too
|
|
A,B,C
|
|
1,2,3
|
|
4,5,6""")
|
|
|
|
path = '__%s__.csv' % tm.rands(10)
|
|
|
|
with tm.ensure_clean(path) as path:
|
|
for sep, dat in [('\t', data), (',', data2)]:
|
|
for enc in ['utf-16', 'utf-16le', 'utf-16be']:
|
|
bytes = dat.encode(enc)
|
|
with open(path, 'wb') as f:
|
|
f.write(bytes)
|
|
|
|
s = BytesIO(dat.encode('utf-8'))
|
|
if compat.PY3:
|
|
# somewhat False since the code never sees bytes
|
|
from io import TextIOWrapper
|
|
s = TextIOWrapper(s, encoding='utf-8')
|
|
|
|
result = self.read_csv(path, encoding=enc, skiprows=2,
|
|
sep=sep)
|
|
expected = self.read_csv(s, encoding='utf-8', skiprows=2,
|
|
sep=sep)
|
|
s.close()
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_utf16_example(self, datapath):
|
|
path = datapath('io', 'parser', 'data', 'utf16_ex.txt')
|
|
|
|
# it works! and is the right length
|
|
result = self.read_table(path, encoding='utf-16')
|
|
assert len(result) == 50
|
|
|
|
if not compat.PY3:
|
|
buf = BytesIO(open(path, 'rb').read())
|
|
result = self.read_table(buf, encoding='utf-16')
|
|
assert len(result) == 50
|
|
|
|
def test_unicode_encoding(self, datapath):
|
|
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
|
|
|
|
result = self.read_csv(pth, header=None, encoding='latin-1')
|
|
result = result.set_index(0)
|
|
|
|
got = result[1][1632]
|
|
expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)')
|
|
|
|
assert got == expected
|
|
|
|
def test_trailing_delimiters(self):
|
|
# #2442. grumble grumble
|
|
data = """A,B,C
|
|
1,2,3,
|
|
4,5,6,
|
|
7,8,9,"""
|
|
result = self.read_csv(StringIO(data), index_col=False)
|
|
|
|
expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8],
|
|
'C': [3, 6, 9]})
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_escapechar(self):
|
|
# http://stackoverflow.com/questions/13824840/feature-request-for-
|
|
# pandas-read-csv
|
|
data = '''SEARCH_TERM,ACTUAL_URL
|
|
"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
|
"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
|
|
"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa
|
|
|
|
result = self.read_csv(StringIO(data), escapechar='\\',
|
|
quotechar='"', encoding='utf-8')
|
|
assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", '
|
|
'IKEA:s 1700-tals serie')
|
|
tm.assert_index_equal(result.columns,
|
|
Index(['SEARCH_TERM', 'ACTUAL_URL']))
|
|
|
|
def test_int64_min_issues(self):
|
|
# #2599
|
|
data = 'A,B\n0,0\n0,'
|
|
|
|
result = self.read_csv(StringIO(data))
|
|
expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]})
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_parse_integers_above_fp_precision(self):
|
|
data = """Numbers
|
|
17007000002000191
|
|
17007000002000191
|
|
17007000002000191
|
|
17007000002000191
|
|
17007000002000192
|
|
17007000002000192
|
|
17007000002000192
|
|
17007000002000192
|
|
17007000002000192
|
|
17007000002000194"""
|
|
|
|
result = self.read_csv(StringIO(data))
|
|
expected = DataFrame({'Numbers': [17007000002000191,
|
|
17007000002000191,
|
|
17007000002000191,
|
|
17007000002000191,
|
|
17007000002000192,
|
|
17007000002000192,
|
|
17007000002000192,
|
|
17007000002000192,
|
|
17007000002000192,
|
|
17007000002000194]})
|
|
|
|
tm.assert_series_equal(result['Numbers'], expected['Numbers'])
|
|
|
|
def test_chunks_have_consistent_numerical_type(self):
|
|
integers = [str(i) for i in range(499999)]
|
|
data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers)
|
|
|
|
with tm.assert_produces_warning(False):
|
|
df = self.read_csv(StringIO(data))
|
|
# Assert that types were coerced.
|
|
assert type(df.a[0]) is np.float64
|
|
assert df.a.dtype == np.float
|
|
|
|
def test_warn_if_chunks_have_mismatched_type(self):
|
|
warning_type = False
|
|
integers = [str(i) for i in range(499999)]
|
|
data = "a\n" + "\n".join(integers + ['a', 'b'] + integers)
|
|
|
|
# see gh-3866: if chunks are different types and can't
|
|
# be coerced using numerical types, then issue warning.
|
|
if self.engine == 'c' and self.low_memory:
|
|
warning_type = DtypeWarning
|
|
|
|
with tm.assert_produces_warning(warning_type):
|
|
df = self.read_csv(StringIO(data))
|
|
assert df.a.dtype == np.object
|
|
|
|
def test_integer_overflow_bug(self):
|
|
# see gh-2601
|
|
data = "65248E10 11\n55555E55 22\n"
|
|
|
|
result = self.read_csv(StringIO(data), header=None, sep=' ')
|
|
assert result[0].dtype == np.float64
|
|
|
|
result = self.read_csv(StringIO(data), header=None, sep=r'\s+')
|
|
assert result[0].dtype == np.float64
|
|
|
|
def test_catch_too_many_names(self):
|
|
# see gh-5156
|
|
data = """\
|
|
1,2,3
|
|
4,,6
|
|
7,8,9
|
|
10,11,12\n"""
|
|
pytest.raises(ValueError, self.read_csv, StringIO(data),
|
|
header=0, names=['a', 'b', 'c', 'd'])
|
|
|
|
def test_ignore_leading_whitespace(self):
|
|
# see gh-3374, gh-6607
|
|
data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9'
|
|
result = self.read_table(StringIO(data), sep=r'\s+')
|
|
expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_chunk_begins_with_newline_whitespace(self):
|
|
# see gh-10022
|
|
data = '\n hello\nworld\n'
|
|
result = self.read_csv(StringIO(data), header=None)
|
|
assert len(result) == 2
|
|
|
|
# see gh-9735: this issue is C parser-specific (bug when
|
|
# parsing whitespace and characters at chunk boundary)
|
|
if self.engine == 'c':
|
|
chunk1 = 'a' * (1024 * 256 - 2) + '\na'
|
|
chunk2 = '\n a'
|
|
result = self.read_csv(StringIO(chunk1 + chunk2), header=None)
|
|
expected = DataFrame(['a' * (1024 * 256 - 2), 'a', ' a'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_empty_with_index(self):
|
|
# see gh-10184
|
|
data = 'x,y'
|
|
result = self.read_csv(StringIO(data), index_col=0)
|
|
expected = DataFrame([], columns=['y'], index=Index([], name='x'))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_empty_with_multiindex(self):
|
|
# see gh-10467
|
|
data = 'x,y,z'
|
|
result = self.read_csv(StringIO(data), index_col=['x', 'y'])
|
|
expected = DataFrame([], columns=['z'],
|
|
index=MultiIndex.from_arrays(
|
|
[[]] * 2, names=['x', 'y']))
|
|
tm.assert_frame_equal(result, expected, check_index_type=False)
|
|
|
|
def test_empty_with_reversed_multiindex(self):
|
|
data = 'x,y,z'
|
|
result = self.read_csv(StringIO(data), index_col=[1, 0])
|
|
expected = DataFrame([], columns=['z'],
|
|
index=MultiIndex.from_arrays(
|
|
[[]] * 2, names=['y', 'x']))
|
|
tm.assert_frame_equal(result, expected, check_index_type=False)
|
|
|
|
def test_float_parser(self):
|
|
# see gh-9565
|
|
data = '45e-1,4.5,45.,inf,-inf'
|
|
result = self.read_csv(StringIO(data), header=None)
|
|
expected = DataFrame([[float(s) for s in data.split(',')]])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_scientific_no_exponent(self):
|
|
# see gh-12215
|
|
df = DataFrame.from_dict(OrderedDict([('w', ['2e']), ('x', ['3E']),
|
|
('y', ['42e']),
|
|
('z', ['632E'])]))
|
|
data = df.to_csv(index=False)
|
|
for prec in self.float_precision_choices:
|
|
df_roundtrip = self.read_csv(
|
|
StringIO(data), float_precision=prec)
|
|
tm.assert_frame_equal(df_roundtrip, df)
|
|
|
|
def test_int64_overflow(self):
|
|
data = """ID
|
|
00013007854817840016671868
|
|
00013007854817840016749251
|
|
00013007854817840016754630
|
|
00013007854817840016781876
|
|
00013007854817840017028824
|
|
00013007854817840017963235
|
|
00013007854817840018860166"""
|
|
|
|
# 13007854817840016671868 > UINT64_MAX, so this
|
|
# will overflow and return object as the dtype.
|
|
result = self.read_csv(StringIO(data))
|
|
assert result['ID'].dtype == object
|
|
|
|
# 13007854817840016671868 > UINT64_MAX, so attempts
|
|
# to cast to either int64 or uint64 will result in
|
|
# an OverflowError being raised.
|
|
for conv in (np.int64, np.uint64):
|
|
pytest.raises(OverflowError, self.read_csv,
|
|
StringIO(data), converters={'ID': conv})
|
|
|
|
# These numbers fall right inside the int64-uint64 range,
|
|
# so they should be parsed as string.
|
|
ui_max = np.iinfo(np.uint64).max
|
|
i_max = np.iinfo(np.int64).max
|
|
i_min = np.iinfo(np.int64).min
|
|
|
|
for x in [i_max, i_min, ui_max]:
|
|
result = self.read_csv(StringIO(str(x)), header=None)
|
|
expected = DataFrame([x])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# These numbers fall just outside the int64-uint64 range,
|
|
# so they should be parsed as string.
|
|
too_big = ui_max + 1
|
|
too_small = i_min - 1
|
|
|
|
for x in [too_big, too_small]:
|
|
result = self.read_csv(StringIO(str(x)), header=None)
|
|
expected = DataFrame([str(x)])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# No numerical dtype can hold both negative and uint64 values,
|
|
# so they should be cast as string.
|
|
data = '-1\n' + str(2**63)
|
|
expected = DataFrame([str(-1), str(2**63)])
|
|
result = self.read_csv(StringIO(data), header=None)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
data = str(2**63) + '\n-1'
|
|
expected = DataFrame([str(2**63), str(-1)])
|
|
result = self.read_csv(StringIO(data), header=None)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_empty_with_nrows_chunksize(self):
|
|
# see gh-9535
|
|
expected = DataFrame([], columns=['foo', 'bar'])
|
|
result = self.read_csv(StringIO('foo,bar\n'), nrows=10)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = next(iter(self.read_csv(
|
|
StringIO('foo,bar\n'), chunksize=10)))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_eof_states(self):
|
|
# see gh-10728, gh-10548
|
|
|
|
# With skip_blank_lines = True
|
|
expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])
|
|
|
|
# gh-10728: WHITESPACE_LINE
|
|
data = 'a,b,c\n4,5,6\n '
|
|
result = self.read_csv(StringIO(data))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# gh-10548: EAT_LINE_COMMENT
|
|
data = 'a,b,c\n4,5,6\n#comment'
|
|
result = self.read_csv(StringIO(data), comment='#')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# EAT_CRNL_NOP
|
|
data = 'a,b,c\n4,5,6\n\r'
|
|
result = self.read_csv(StringIO(data))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# EAT_COMMENT
|
|
data = 'a,b,c\n4,5,6#comment'
|
|
result = self.read_csv(StringIO(data), comment='#')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# SKIP_LINE
|
|
data = 'a,b,c\n4,5,6\nskipme'
|
|
result = self.read_csv(StringIO(data), skiprows=[2])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# With skip_blank_lines = False
|
|
|
|
# EAT_LINE_COMMENT
|
|
data = 'a,b,c\n4,5,6\n#comment'
|
|
result = self.read_csv(
|
|
StringIO(data), comment='#', skip_blank_lines=False)
|
|
expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# IN_FIELD
|
|
data = 'a,b,c\n4,5,6\n '
|
|
result = self.read_csv(StringIO(data), skip_blank_lines=False)
|
|
expected = DataFrame(
|
|
[['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# EAT_CRNL
|
|
data = 'a,b,c\n4,5,6\n\r'
|
|
result = self.read_csv(StringIO(data), skip_blank_lines=False)
|
|
expected = DataFrame(
|
|
[[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Should produce exceptions
|
|
|
|
# ESCAPED_CHAR
|
|
data = "a,b,c\n4,5,6\n\\"
|
|
pytest.raises(Exception, self.read_csv,
|
|
StringIO(data), escapechar='\\')
|
|
|
|
# ESCAPE_IN_QUOTED_FIELD
|
|
data = 'a,b,c\n4,5,6\n"\\'
|
|
pytest.raises(Exception, self.read_csv,
|
|
StringIO(data), escapechar='\\')
|
|
|
|
# IN_QUOTED_FIELD
|
|
data = 'a,b,c\n4,5,6\n"'
|
|
pytest.raises(Exception, self.read_csv,
|
|
StringIO(data), escapechar='\\')
|
|
|
|
def test_uneven_lines_with_usecols(self):
|
|
# See gh-12203
|
|
csv = r"""a,b,c
|
|
0,1,2
|
|
3,4,5,6,7
|
|
8,9,10
|
|
"""
|
|
|
|
# make sure that an error is still thrown
|
|
# when the 'usecols' parameter is not provided
|
|
msg = r"Expected \d+ fields in line \d+, saw \d+"
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
df = self.read_csv(StringIO(csv))
|
|
|
|
expected = DataFrame({
|
|
'a': [0, 3, 8],
|
|
'b': [1, 4, 9]
|
|
})
|
|
|
|
usecols = [0, 1]
|
|
df = self.read_csv(StringIO(csv), usecols=usecols)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
usecols = ['a', 'b']
|
|
df = self.read_csv(StringIO(csv), usecols=usecols)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_read_empty_with_usecols(self):
|
|
# See gh-12493
|
|
names = ['Dummy', 'X', 'Dummy_2']
|
|
usecols = names[1:2] # ['X']
|
|
|
|
# first, check to see that the response of
|
|
# parser when faced with no provided columns
|
|
# throws the correct error, with or without usecols
|
|
errmsg = "No columns to parse from file"
|
|
|
|
with tm.assert_raises_regex(EmptyDataError, errmsg):
|
|
self.read_csv(StringIO(''))
|
|
|
|
with tm.assert_raises_regex(EmptyDataError, errmsg):
|
|
self.read_csv(StringIO(''), usecols=usecols)
|
|
|
|
expected = DataFrame(columns=usecols, index=[0], dtype=np.float64)
|
|
df = self.read_csv(StringIO(',,'), names=names, usecols=usecols)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
expected = DataFrame(columns=usecols)
|
|
df = self.read_csv(StringIO(''), names=names, usecols=usecols)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_trailing_spaces(self):
|
|
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa
|
|
expected = DataFrame([[1., 2., 4.],
|
|
[5.1, np.nan, 10.]])
|
|
|
|
# gh-8661, gh-8679: this should ignore six lines including
|
|
# lines with trailing whitespace and blank lines
|
|
df = self.read_csv(StringIO(data.replace(',', ' ')),
|
|
header=None, delim_whitespace=True,
|
|
skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
|
|
tm.assert_frame_equal(df, expected)
|
|
df = self.read_table(StringIO(data.replace(',', ' ')),
|
|
header=None, delim_whitespace=True,
|
|
skiprows=[0, 1, 2, 3, 5, 6],
|
|
skip_blank_lines=True)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# gh-8983: test skipping set of rows after a row with trailing spaces
|
|
expected = DataFrame({"A": [1., 5.1], "B": [2., np.nan],
|
|
"C": [4., 10]})
|
|
df = self.read_table(StringIO(data.replace(',', ' ')),
|
|
delim_whitespace=True,
|
|
skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_raise_on_sep_with_delim_whitespace(self):
|
|
# see gh-6607
|
|
data = 'a b c\n1 2 3'
|
|
with tm.assert_raises_regex(ValueError,
|
|
'you can only specify one'):
|
|
self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True)
|
|
|
|
def test_single_char_leading_whitespace(self):
|
|
# see gh-9710
|
|
data = """\
|
|
MyColumn
|
|
a
|
|
b
|
|
a
|
|
b\n"""
|
|
|
|
expected = DataFrame({'MyColumn': list('abab')})
|
|
|
|
result = self.read_csv(StringIO(data), delim_whitespace=True,
|
|
skipinitialspace=True)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = self.read_csv(StringIO(data), skipinitialspace=True)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_empty_lines(self):
|
|
data = """\
|
|
A,B,C
|
|
1,2.,4.
|
|
|
|
|
|
5.,NaN,10.0
|
|
|
|
-70,.4,1
|
|
"""
|
|
expected = np.array([[1., 2., 4.],
|
|
[5., np.nan, 10.],
|
|
[-70., .4, 1.]])
|
|
df = self.read_csv(StringIO(data))
|
|
tm.assert_numpy_array_equal(df.values, expected)
|
|
df = self.read_csv(StringIO(data.replace(',', ' ')), sep=r'\s+')
|
|
tm.assert_numpy_array_equal(df.values, expected)
|
|
expected = np.array([[1., 2., 4.],
|
|
[np.nan, np.nan, np.nan],
|
|
[np.nan, np.nan, np.nan],
|
|
[5., np.nan, 10.],
|
|
[np.nan, np.nan, np.nan],
|
|
[-70., .4, 1.]])
|
|
df = self.read_csv(StringIO(data), skip_blank_lines=False)
|
|
tm.assert_numpy_array_equal(df.values, expected)
|
|
|
|
def test_whitespace_lines(self):
|
|
data = """
|
|
|
|
\t \t\t
|
|
\t
|
|
A,B,C
|
|
\t 1,2.,4.
|
|
5.,NaN,10.0
|
|
"""
|
|
expected = np.array([[1, 2., 4.],
|
|
[5., np.nan, 10.]])
|
|
df = self.read_csv(StringIO(data))
|
|
tm.assert_numpy_array_equal(df.values, expected)
|
|
|
|
def test_regex_separator(self):
|
|
# see gh-6607
|
|
data = """ A B C D
|
|
a 1 2 3 4
|
|
b 1 2 3 4
|
|
c 1 2 3 4
|
|
"""
|
|
df = self.read_table(StringIO(data), sep=r'\s+')
|
|
expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
|
|
index_col=0)
|
|
assert expected.index.name is None
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9'
|
|
result = self.read_table(StringIO(data), sep=r'\s+')
|
|
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
|
|
columns=['a', 'b', 'c'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@tm.capture_stdout
|
|
def test_verbose_import(self):
|
|
text = """a,b,c,d
|
|
one,1,2,3
|
|
one,1,2,3
|
|
,1,2,3
|
|
one,1,2,3
|
|
,1,2,3
|
|
,1,2,3
|
|
one,1,2,3
|
|
two,1,2,3"""
|
|
|
|
# Engines are verbose in different ways.
|
|
self.read_csv(StringIO(text), verbose=True)
|
|
output = sys.stdout.getvalue()
|
|
|
|
if self.engine == 'c':
|
|
assert 'Tokenization took:' in output
|
|
assert 'Parser memory cleanup took:' in output
|
|
else: # Python engine
|
|
assert output == 'Filled 3 NA values in column a\n'
|
|
|
|
# Reset the stdout buffer.
|
|
sys.stdout = StringIO()
|
|
|
|
text = """a,b,c,d
|
|
one,1,2,3
|
|
two,1,2,3
|
|
three,1,2,3
|
|
four,1,2,3
|
|
five,1,2,3
|
|
,1,2,3
|
|
seven,1,2,3
|
|
eight,1,2,3"""
|
|
|
|
self.read_csv(StringIO(text), verbose=True, index_col=0)
|
|
output = sys.stdout.getvalue()
|
|
|
|
# Engines are verbose in different ways.
|
|
if self.engine == 'c':
|
|
assert 'Tokenization took:' in output
|
|
assert 'Parser memory cleanup took:' in output
|
|
else: # Python engine
|
|
assert output == 'Filled 1 NA values in column a\n'
|
|
|
|
@pytest.mark.skipif(PY3, reason="won't work in Python 3")
|
|
def test_iteration_open_handle(self):
|
|
|
|
with tm.ensure_clean() as path:
|
|
with open(path, 'wb') as f:
|
|
f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')
|
|
|
|
with open(path, 'rb') as f:
|
|
for line in f:
|
|
if 'CCC' in line:
|
|
break
|
|
|
|
if self.engine == 'c':
|
|
pytest.raises(Exception, self.read_table,
|
|
f, squeeze=True, header=None)
|
|
else:
|
|
result = self.read_table(f, squeeze=True, header=None)
|
|
expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_1000_sep_with_decimal(self):
|
|
data = """A|B|C
|
|
1|2,334.01|5
|
|
10|13|10.
|
|
"""
|
|
expected = DataFrame({
|
|
'A': [1, 10],
|
|
'B': [2334.01, 13],
|
|
'C': [5, 10.]
|
|
})
|
|
|
|
assert expected.A.dtype == 'int64'
|
|
assert expected.B.dtype == 'float'
|
|
assert expected.C.dtype == 'float'
|
|
|
|
df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = self.read_table(StringIO(data), sep='|',
|
|
thousands=',', decimal='.')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
data_with_odd_sep = """A|B|C
|
|
1|2.334,01|5
|
|
10|13|10,
|
|
"""
|
|
df = self.read_csv(StringIO(data_with_odd_sep),
|
|
sep='|', thousands='.', decimal=',')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = self.read_table(StringIO(data_with_odd_sep),
|
|
sep='|', thousands='.', decimal=',')
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_euro_decimal_format(self):
|
|
data = """Id;Number1;Number2;Text1;Text2;Number3
|
|
1;1521,1541;187101,9543;ABC;poi;4,738797819
|
|
2;121,12;14897,76;DEF;uyt;0,377320872
|
|
3;878,158;108013,434;GHI;rez;2,735694704"""
|
|
|
|
df2 = self.read_csv(StringIO(data), sep=';', decimal=',')
|
|
assert df2['Number1'].dtype == float
|
|
assert df2['Number2'].dtype == float
|
|
assert df2['Number3'].dtype == float
|
|
|
|
def test_inf_parsing(self):
|
|
data = """\
|
|
,A
|
|
a,inf
|
|
b,-inf
|
|
c,+Inf
|
|
d,-Inf
|
|
e,INF
|
|
f,-INF
|
|
g,+INf
|
|
h,-INf
|
|
i,inF
|
|
j,-inF"""
|
|
inf = float('inf')
|
|
expected = Series([inf, -inf] * 5)
|
|
|
|
df = self.read_csv(StringIO(data), index_col=0)
|
|
tm.assert_almost_equal(df['A'].values, expected.values)
|
|
|
|
df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
|
|
tm.assert_almost_equal(df['A'].values, expected.values)
|
|
|
|
def test_raise_on_no_columns(self):
|
|
# single newline
|
|
data = "\n"
|
|
pytest.raises(EmptyDataError, self.read_csv, StringIO(data))
|
|
|
|
# test with more than a single newline
|
|
data = "\n\n\n"
|
|
pytest.raises(EmptyDataError, self.read_csv, StringIO(data))
|
|
|
|
def test_memory_map(self):
|
|
mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
|
|
expected = DataFrame({
|
|
'a': [1, 2, 3],
|
|
'b': ['one', 'two', 'three'],
|
|
'c': ['I', 'II', 'III']
|
|
})
|
|
|
|
out = self.read_csv(mmap_file, memory_map=True)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
def test_null_byte_char(self):
|
|
# see gh-2741
|
|
data = '\x00,foo'
|
|
cols = ['a', 'b']
|
|
|
|
expected = DataFrame([[np.nan, 'foo']],
|
|
columns=cols)
|
|
|
|
if self.engine == 'c':
|
|
out = self.read_csv(StringIO(data), names=cols)
|
|
tm.assert_frame_equal(out, expected)
|
|
else:
|
|
msg = "NULL byte detected"
|
|
with tm.assert_raises_regex(ParserError, msg):
|
|
self.read_csv(StringIO(data), names=cols)
|
|
|
|
def test_utf8_bom(self):
|
|
# see gh-4793
|
|
bom = u('\ufeff')
|
|
utf8 = 'utf-8'
|
|
|
|
def _encode_data_with_bom(_data):
|
|
bom_data = (bom + _data).encode(utf8)
|
|
return BytesIO(bom_data)
|
|
|
|
# basic test
|
|
data = 'a\n1'
|
|
expected = DataFrame({'a': [1]})
|
|
|
|
out = self.read_csv(_encode_data_with_bom(data),
|
|
encoding=utf8)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
# test with "regular" quoting
|
|
data = '"a"\n1'
|
|
expected = DataFrame({'a': [1]})
|
|
|
|
out = self.read_csv(_encode_data_with_bom(data),
|
|
encoding=utf8, quotechar='"')
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
# test in a data row instead of header
|
|
data = 'b\n1'
|
|
expected = DataFrame({'a': ['b', '1']})
|
|
|
|
out = self.read_csv(_encode_data_with_bom(data),
|
|
encoding=utf8, names=['a'])
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
# test in empty data row with skipping
|
|
data = '\n1'
|
|
expected = DataFrame({'a': [1]})
|
|
|
|
out = self.read_csv(_encode_data_with_bom(data),
|
|
encoding=utf8, names=['a'],
|
|
skip_blank_lines=True)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
# test in empty data row without skipping
|
|
data = '\n1'
|
|
expected = DataFrame({'a': [np.nan, 1.0]})
|
|
|
|
out = self.read_csv(_encode_data_with_bom(data),
|
|
encoding=utf8, names=['a'],
|
|
skip_blank_lines=False)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
def test_temporary_file(self):
|
|
# see gh-13398
|
|
data1 = "0 0"
|
|
|
|
from tempfile import TemporaryFile
|
|
new_file = TemporaryFile("w+")
|
|
new_file.write(data1)
|
|
new_file.flush()
|
|
new_file.seek(0)
|
|
|
|
result = self.read_csv(new_file, sep=r'\s+', header=None)
|
|
new_file.close()
|
|
expected = DataFrame([[0, 0]])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_read_csv_utf_aliases(self):
|
|
# see gh issue 13549
|
|
expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']})
|
|
for byte in [8, 16]:
|
|
for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']:
|
|
encoding = fmt.format(byte)
|
|
data = 'mb_num,multibyte\n4.8,test'.encode(encoding)
|
|
result = self.read_csv(BytesIO(data), encoding=encoding)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_internal_eof_byte(self):
|
|
# see gh-5500
|
|
data = "a,b\n1\x1a,2"
|
|
|
|
expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b'])
|
|
result = self.read_csv(StringIO(data))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_internal_eof_byte_to_file(self):
|
|
# see gh-16559
|
|
data = b'c1,c2\r\n"test \x1a test", test\r\n'
|
|
expected = pd.DataFrame([["test \x1a test", " test"]],
|
|
columns=["c1", "c2"])
|
|
|
|
path = '__%s__.csv' % tm.rands(10)
|
|
|
|
with tm.ensure_clean(path) as path:
|
|
with open(path, "wb") as f:
|
|
f.write(data)
|
|
|
|
result = self.read_csv(path)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_sub_character(self, datapath):
|
|
# see gh-16893
|
|
filename = datapath('io', 'parser', 'data', 'sub_char.csv')
|
|
|
|
expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
|
|
result = self.read_csv(filename)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_file_handles(self):
|
|
# GH 14418 - don't close user provided file handles
|
|
|
|
fh = StringIO('a,b\n1,2')
|
|
self.read_csv(fh)
|
|
assert not fh.closed
|
|
|
|
with open(self.csv1, 'r') as f:
|
|
self.read_csv(f)
|
|
assert not f.closed
|
|
|
|
# mmap not working with python engine
|
|
if self.engine != 'python':
|
|
|
|
import mmap
|
|
with open(self.csv1, 'r') as f:
|
|
m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
|
|
self.read_csv(m)
|
|
# closed attribute new in python 3.2
|
|
if PY3:
|
|
assert not m.closed
|
|
m.close()
|
|
|
|
def test_invalid_file_buffer(self):
|
|
# see gh-15337
|
|
|
|
class InvalidBuffer(object):
|
|
pass
|
|
|
|
msg = "Invalid file path or buffer object type"
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(InvalidBuffer())
|
|
|
|
# gh-16135: we want to ensure that "tell" and "seek"
|
|
# aren't actually being used when we call `read_csv`
|
|
#
|
|
# Thus, while the object may look "invalid" (these
|
|
# methods are attributes of the `StringIO` class),
|
|
# it is still a valid file-object for our purposes.
|
|
class NoSeekTellBuffer(StringIO):
|
|
def tell(self):
|
|
raise AttributeError("No tell method")
|
|
|
|
def seek(self, pos, whence=0):
|
|
raise AttributeError("No seek method")
|
|
|
|
data = "a\n1"
|
|
|
|
expected = pd.DataFrame({"a": [1]})
|
|
result = self.read_csv(NoSeekTellBuffer(data))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
if PY3:
|
|
from unittest import mock
|
|
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
self.read_csv(mock.Mock())
|
|
|
|
@tm.capture_stderr
|
|
def test_skip_bad_lines(self):
|
|
# see gh-15925
|
|
data = 'a\n1\n1,2,3\n4\n5,6,7'
|
|
|
|
with pytest.raises(ParserError):
|
|
self.read_csv(StringIO(data))
|
|
|
|
with pytest.raises(ParserError):
|
|
self.read_csv(StringIO(data), error_bad_lines=True)
|
|
|
|
expected = DataFrame({'a': [1, 4]})
|
|
|
|
out = self.read_csv(StringIO(data),
|
|
error_bad_lines=False,
|
|
warn_bad_lines=False)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
val = sys.stderr.getvalue()
|
|
assert val == ''
|
|
|
|
# Reset the stderr buffer.
|
|
sys.stderr = StringIO()
|
|
|
|
out = self.read_csv(StringIO(data),
|
|
error_bad_lines=False,
|
|
warn_bad_lines=True)
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
val = sys.stderr.getvalue()
|
|
assert 'Skipping line 3' in val
|
|
assert 'Skipping line 5' in val
|