# Tests for line-delimited (JSON Lines) reading and writing in pandas.
# -*- coding: utf-8 -*-
import pytest

import pandas as pd
from pandas import DataFrame, read_json
from pandas.compat import StringIO
from pandas.io.json.json import JsonReader
import pandas.util.testing as tm
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
                                 ensure_clean)
@pytest.fixture
def lines_json_df():
    """Line-delimited JSON ("records" orient) for a small two-column frame."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
|
||
|
|
||
|
|
||
|
def test_read_jsonl():
    # GH9180: lines=True parses one JSON object per line, regardless of
    # key order within each record.
    # FIX: wrap the literal JSON in StringIO -- passing a raw JSON string
    # to read_json is deprecated (it is ambiguous with a file path).
    result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'),
                       lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)

    # simulate string input
    # FIX: wrapped in StringIO -- passing a raw JSON string to read_json
    # is deprecated (ambiguous with a file path).
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(StringIO(json), lines=True)
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])
    assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_to_jsonl():
|
||
|
# GH9180
|
||
|
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
|
||
|
result = df.to_json(orient="records", lines=True)
|
||
|
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
|
||
|
assert result == expected
|
||
|
|
||
|
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
|
||
|
result = df.to_json(orient="records", lines=True)
|
||
|
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
|
||
|
assert result == expected
|
||
|
assert_frame_equal(read_json(result, lines=True), df)
|
||
|
|
||
|
# GH15096: escaped characters in columns and data
|
||
|
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
|
||
|
columns=["a\\", 'b'])
|
||
|
result = df.to_json(orient="records", lines=True)
|
||
|
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
|
||
|
'{"a\\\\":"foo\\"","b":"bar"}')
|
||
|
assert result == expected
|
||
|
assert_frame_equal(read_json(result, lines=True), df)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [1, 1.0])
|
||
|
def test_readjson_chunks(lines_json_df, chunksize):
|
||
|
# Basic test that read_json(chunks=True) gives the same result as
|
||
|
# read_json(chunks=False)
|
||
|
# GH17048: memory usage when lines=True
|
||
|
|
||
|
unchunked = read_json(StringIO(lines_json_df), lines=True)
|
||
|
reader = read_json(StringIO(lines_json_df), lines=True,
|
||
|
chunksize=chunksize)
|
||
|
chunked = pd.concat(reader)
|
||
|
|
||
|
assert_frame_equal(chunked, unchunked)
|
||
|
|
||
|
|
||
|
def test_readjson_chunksize_requires_lines(lines_json_df):
    # chunksize only makes sense for line-delimited JSON, so combining it
    # with lines=False must be rejected up front.
    expected_msg = "chunksize can only be passed if lines=True"
    with tm.assert_raises_regex(ValueError, expected_msg):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
|
||
|
|
||
|
|
||
|
def test_readjson_chunks_series():
    # Reading line-format JSON into a Series honours chunksize too.
    ser = pd.Series({'A': 1, 'B': 2})

    unchunked = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series')

    reader = pd.read_json(
        StringIO(ser.to_json(lines=True, orient="records")),
        lines=True, typ='Series', chunksize=1)
    chunked = pd.concat(reader)

    assert_series_equal(chunked, unchunked)
|
||
|
|
||
|
|
||
|
def test_readjson_each_chunk(lines_json_df):
|
||
|
# Other tests check that the final result of read_json(chunksize=True)
|
||
|
# is correct. This checks the intermediate chunks.
|
||
|
chunks = list(
|
||
|
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
|
||
|
)
|
||
|
assert chunks[0].shape == (2, 2)
|
||
|
assert chunks[1].shape == (1, 2)
|
||
|
|
||
|
|
||
|
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads of an on-disk line-delimited JSON file
    # must agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        from_chunks = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        whole = pd.read_json(path, lines=True)
        assert_frame_equal(whole, from_chunks)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # The reader must close its underlying stream once fully consumed,
    # both with and without chunked iteration.
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize, compression=None)
        reader.read()
        # FIX: the original built this message with a backslash
        # line-continuation inside the string literal, leaking source
        # indentation into the assertion text; format it cleanly instead.
        assert reader.open_stream.closed, (
            "didn't close stream with chunksize = {chunksize}".format(
                chunksize=chunksize))
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    # Chunk sizes that are non-positive, fractional, or not numbers at all
    # must be rejected with a clear error.
    expected = r"'chunksize' must be an integer >=1"
    with tm.assert_raises_regex(ValueError, expected):
        pd.read_json(
            StringIO(lines_json_df), lines=True, chunksize=chunksize)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Blank lines between records -- including runs of several -- must be
    # skipped, with and without chunked reading.
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}







    {"A":3,"B":6}
    """
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    # FIX: wrap the literal JSON in StringIO -- passing a raw JSON string
    # to read_json is deprecated (ambiguous with a file path).
    test = pd.read_json(StringIO(j), lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(
        orig, test, obj="chunksize: {chunksize}".format(chunksize=chunksize))
|