laywerrobot/lib/python3.6/site-packages/pandas/tests/io/json/test_pandas.py

1227 lines
49 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
import pytest
from pandas.compat import (range, lrange, StringIO,
OrderedDict, is_platform_32bit)
import os
import numpy as np
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
read_json, compat)
from datetime import timedelta
import pandas as pd
import json
from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
assert_series_equal, network,
ensure_clean, assert_index_equal)
import pandas.util.testing as tm
_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()
_frame = DataFrame(_seriesd)
_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = DataFrame(dict((k, v.astype(np.int64))
for k, v in compat.iteritems(_seriesd)))
_tsframe = DataFrame(_tsd)
_cat_frame = _frame.copy()
cat = ['bah'] * 5 + ['bar'] * 5 + ['baz'] * \
5 + ['foo'] * (len(_cat_frame) - 15)
_cat_frame.index = pd.CategoricalIndex(cat, name='E')
_cat_frame['E'] = list(reversed(cat))
_cat_frame['sort'] = np.arange(len(_cat_frame), dtype='int64')
_mixed_frame = _frame.copy()
class TestPandasContainer(object):
@pytest.fixture(scope="function", autouse=True)
def setup(self, datapath):
self.dirpath = datapath("io", "json", "data")
self.ts = tm.makeTimeSeries()
self.ts.name = 'ts'
self.series = tm.makeStringSeries()
self.series.name = 'series'
self.objSeries = tm.makeObjectSeries()
self.objSeries.name = 'objects'
self.empty_series = Series([], index=[])
self.empty_frame = DataFrame({})
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.intframe = _intframe.copy()
self.tsframe = _tsframe.copy()
self.mixed_frame = _mixed_frame.copy()
self.categorical = _cat_frame.copy()
yield
del self.dirpath
del self.ts
del self.series
del self.objSeries
del self.empty_series
del self.empty_frame
del self.frame
del self.frame2
del self.intframe
del self.tsframe
del self.mixed_frame
def test_frame_double_encoded_labels(self):
df = DataFrame([['a', 'b'], ['c', 'd']],
index=['index " 1', 'index / 2'],
columns=['a \\ b', 'y / z'])
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split'))
assert_frame_equal(df, read_json(df.to_json(orient='columns'),
orient='columns'))
assert_frame_equal(df, read_json(df.to_json(orient='index'),
orient='index'))
df_unser = read_json(df.to_json(orient='records'), orient='records')
assert_index_equal(df.columns, df_unser.columns)
tm.assert_numpy_array_equal(df.values, df_unser.values)
def test_frame_non_unique_index(self):
df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
columns=['x', 'y'])
pytest.raises(ValueError, df.to_json, orient='index')
pytest.raises(ValueError, df.to_json, orient='columns')
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split'))
unser = read_json(df.to_json(orient='records'), orient='records')
tm.assert_index_equal(df.columns, unser.columns)
tm.assert_almost_equal(df.values, unser.values)
unser = read_json(df.to_json(orient='values'), orient='values')
tm.assert_numpy_array_equal(df.values, unser.values)
def test_frame_non_unique_columns(self):
df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
columns=['x', 'x'])
pytest.raises(ValueError, df.to_json, orient='index')
pytest.raises(ValueError, df.to_json, orient='columns')
pytest.raises(ValueError, df.to_json, orient='records')
assert_frame_equal(df, read_json(df.to_json(orient='split'),
orient='split', dtype=False))
unser = read_json(df.to_json(orient='values'), orient='values')
tm.assert_numpy_array_equal(df.values, unser.values)
# GH4377; duplicate columns not processing correctly
df = DataFrame([['a', 'b'], ['c', 'd']], index=[
1, 2], columns=['x', 'y'])
result = read_json(df.to_json(orient='split'), orient='split')
assert_frame_equal(result, df)
def _check(df):
result = read_json(df.to_json(orient='split'), orient='split',
convert_dates=['x'])
assert_frame_equal(result, df)
for o in [[['a', 'b'], ['c', 'd']],
[[1.5, 2.5], [3.5, 4.5]],
[[1, 2.5], [3, 4.5]],
[[Timestamp('20130101'), 3.5],
[Timestamp('20130102'), 4.5]]]:
_check(DataFrame(o, index=[1, 2], columns=['x', 'x']))
def test_frame_from_json_to_json(self):
def _check_orient(df, orient, dtype=None, numpy=False,
convert_axes=True, check_dtype=True, raise_ok=None,
sort=None, check_index_type=True,
check_column_type=True, check_numpy_dtype=False):
if sort is not None:
df = df.sort_values(sort)
else:
df = df.sort_index()
# if we are not unique, then check that we are raising ValueError
# for the appropriate orients
if not df.index.is_unique and orient in ['index', 'columns']:
pytest.raises(
ValueError, lambda: df.to_json(orient=orient))
return
if (not df.columns.is_unique and
orient in ['index', 'columns', 'records']):
pytest.raises(
ValueError, lambda: df.to_json(orient=orient))
return
dfjson = df.to_json(orient=orient)
try:
unser = read_json(dfjson, orient=orient, dtype=dtype,
numpy=numpy, convert_axes=convert_axes)
except Exception as detail:
if raise_ok is not None:
if isinstance(detail, raise_ok):
return
raise
if sort is not None and sort in unser.columns:
unser = unser.sort_values(sort)
else:
unser = unser.sort_index()
if dtype is False:
check_dtype = False
if not convert_axes and df.index.dtype.type == np.datetime64:
unser.index = DatetimeIndex(
unser.index.values.astype('i8') * 1e6)
if orient == "records":
# index is not captured in this orientation
tm.assert_almost_equal(df.values, unser.values,
check_dtype=check_numpy_dtype)
tm.assert_index_equal(df.columns, unser.columns,
exact=check_column_type)
elif orient == "values":
# index and cols are not captured in this orientation
if numpy is True and df.shape == (0, 0):
assert unser.shape[0] == 0
else:
tm.assert_almost_equal(df.values, unser.values,
check_dtype=check_numpy_dtype)
elif orient == "split":
# index and col labels might not be strings
unser.index = [str(i) for i in unser.index]
unser.columns = [str(i) for i in unser.columns]
if sort is None:
unser = unser.sort_index()
tm.assert_almost_equal(df.values, unser.values,
check_dtype=check_numpy_dtype)
else:
if convert_axes:
tm.assert_frame_equal(df, unser, check_dtype=check_dtype,
check_index_type=check_index_type,
check_column_type=check_column_type)
else:
tm.assert_frame_equal(df, unser, check_less_precise=False,
check_dtype=check_dtype)
def _check_all_orients(df, dtype=None, convert_axes=True,
raise_ok=None, sort=None, check_index_type=True,
check_column_type=True):
# numpy=False
if convert_axes:
_check_orient(df, "columns", dtype=dtype, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "records", dtype=dtype, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "split", dtype=dtype, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "index", dtype=dtype, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "values", dtype=dtype, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "columns", dtype=dtype,
convert_axes=False, sort=sort)
_check_orient(df, "records", dtype=dtype,
convert_axes=False, sort=sort)
_check_orient(df, "split", dtype=dtype,
convert_axes=False, sort=sort)
_check_orient(df, "index", dtype=dtype,
convert_axes=False, sort=sort)
_check_orient(df, "values", dtype=dtype,
convert_axes=False, sort=sort)
# numpy=True and raise_ok might be not None, so ignore the error
if convert_axes:
_check_orient(df, "columns", dtype=dtype, numpy=True,
raise_ok=raise_ok, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "records", dtype=dtype, numpy=True,
raise_ok=raise_ok, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "split", dtype=dtype, numpy=True,
raise_ok=raise_ok, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "index", dtype=dtype, numpy=True,
raise_ok=raise_ok, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "values", dtype=dtype, numpy=True,
raise_ok=raise_ok, sort=sort,
check_index_type=False, check_column_type=False)
_check_orient(df, "columns", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "records", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "split", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "index", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok, sort=sort)
_check_orient(df, "values", dtype=dtype, numpy=True,
convert_axes=False, raise_ok=raise_ok, sort=sort)
# basic
_check_all_orients(self.frame)
assert self.frame.to_json() == self.frame.to_json(orient="columns")
_check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
_check_all_orients(self.intframe, dtype=False)
# big one
# index and columns are strings as all unserialised JSON object keys
# are assumed to be strings
biggie = DataFrame(np.zeros((200, 4)),
columns=[str(i) for i in range(4)],
index=[str(i) for i in range(200)])
_check_all_orients(biggie, dtype=False, convert_axes=False)
# dtypes
_check_all_orients(DataFrame(biggie, dtype=np.float64),
dtype=np.float64, convert_axes=False)
_check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int,
convert_axes=False)
_check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
convert_axes=False, raise_ok=ValueError)
# categorical
_check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)
# empty
_check_all_orients(self.empty_frame, check_index_type=False,
check_column_type=False)
# time series data
_check_all_orients(self.tsframe)
# mixed data
index = pd.Index(['a', 'b', 'c', 'd', 'e'])
data = {'A': [0., 1., 2., 3., 4.],
'B': [0., 1., 0., 1., 0.],
'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
'D': [True, False, True, False, True]}
df = DataFrame(data=data, index=index)
_check_orient(df, "split", check_dtype=False)
_check_orient(df, "records", check_dtype=False)
_check_orient(df, "values", check_dtype=False)
_check_orient(df, "columns", check_dtype=False)
# index oriented is problematic as it is read back in in a transposed
# state, so the columns are interpreted as having mixed data and
# given object dtypes.
# force everything to have object dtype beforehand
_check_orient(df.transpose().transpose(), "index", dtype=False)
def test_frame_from_json_bad_data(self):
pytest.raises(ValueError, read_json, StringIO('{"key":b:a:d}'))
# too few indices
json = StringIO('{"columns":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
pytest.raises(ValueError, read_json, json,
orient="split")
# too many columns
json = StringIO('{"columns":["A","B","C"],'
'"index":["1","2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
pytest.raises(AssertionError, read_json, json,
orient="split")
# bad key
json = StringIO('{"badkey":["A","B"],'
'"index":["2","3"],'
'"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
with tm.assert_raises_regex(ValueError,
r"unexpected key\(s\): badkey"):
read_json(json, orient="split")
def test_frame_from_json_nones(self):
df = DataFrame([[1, 2], [4, 5, 6]])
unser = read_json(df.to_json())
assert np.isnan(unser[2][0])
df = DataFrame([['1', '2'], ['4', '5', '6']])
unser = read_json(df.to_json())
assert np.isnan(unser[2][0])
unser = read_json(df.to_json(), dtype=False)
assert unser[2][0] is None
unser = read_json(df.to_json(), convert_axes=False, dtype=False)
assert unser['2']['0'] is None
unser = read_json(df.to_json(), numpy=False)
assert np.isnan(unser[2][0])
unser = read_json(df.to_json(), numpy=False, dtype=False)
assert unser[2][0] is None
unser = read_json(df.to_json(), numpy=False,
convert_axes=False, dtype=False)
assert unser['2']['0'] is None
# infinities get mapped to nulls which get mapped to NaNs during
# deserialisation
df = DataFrame([[1, 2], [4, 5, 6]])
df.loc[0, 2] = np.inf
unser = read_json(df.to_json())
assert np.isnan(unser[2][0])
unser = read_json(df.to_json(), dtype=False)
assert np.isnan(unser[2][0])
df.loc[0, 2] = np.NINF
unser = read_json(df.to_json())
assert np.isnan(unser[2][0])
unser = read_json(df.to_json(), dtype=False)
assert np.isnan(unser[2][0])
@pytest.mark.skipif(is_platform_32bit(),
reason="not compliant on 32-bit, xref #15865")
def test_frame_to_json_float_precision(self):
df = pd.DataFrame([dict(a_float=0.95)])
encoded = df.to_json(double_precision=1)
assert encoded == '{"a_float":{"0":1.0}}'
df = pd.DataFrame([dict(a_float=1.95)])
encoded = df.to_json(double_precision=1)
assert encoded == '{"a_float":{"0":2.0}}'
df = pd.DataFrame([dict(a_float=-1.95)])
encoded = df.to_json(double_precision=1)
assert encoded == '{"a_float":{"0":-2.0}}'
df = pd.DataFrame([dict(a_float=0.995)])
encoded = df.to_json(double_precision=2)
assert encoded == '{"a_float":{"0":1.0}}'
df = pd.DataFrame([dict(a_float=0.9995)])
encoded = df.to_json(double_precision=3)
assert encoded == '{"a_float":{"0":1.0}}'
df = pd.DataFrame([dict(a_float=0.99999999999999944)])
encoded = df.to_json(double_precision=15)
assert encoded == '{"a_float":{"0":1.0}}'
def test_frame_to_json_except(self):
df = DataFrame([1, 2, 3])
pytest.raises(ValueError, df.to_json, orient="garbage")
def test_frame_empty(self):
df = DataFrame(columns=['jim', 'joe'])
assert not df._is_mixed_type
assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
check_index_type=False)
# GH 7445
result = pd.DataFrame({'test': []}, index=[]).to_json(orient='columns')
expected = '{"test":{}}'
assert result == expected
def test_frame_empty_mixedtype(self):
# mixed type
df = DataFrame(columns=['jim', 'joe'])
df['joe'] = df['joe'].astype('i8')
assert df._is_mixed_type
assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
check_index_type=False)
def test_frame_mixedtype_orient(self): # GH10289
vals = [[10, 1, 'foo', .1, .01],
[20, 2, 'bar', .2, .02],
[30, 3, 'baz', .3, .03],
[40, 4, 'qux', .4, .04]]
df = DataFrame(vals, index=list('abcd'),
columns=['1st', '2nd', '3rd', '4th', '5th'])
assert df._is_mixed_type
right = df.copy()
for orient in ['split', 'index', 'columns']:
inp = df.to_json(orient=orient)
left = read_json(inp, orient=orient, convert_axes=False)
assert_frame_equal(left, right)
right.index = np.arange(len(df))
inp = df.to_json(orient='records')
left = read_json(inp, orient='records', convert_axes=False)
assert_frame_equal(left, right)
right.columns = np.arange(df.shape[1])
inp = df.to_json(orient='values')
left = read_json(inp, orient='values', convert_axes=False)
assert_frame_equal(left, right)
def test_v12_compat(self):
df = DataFrame(
[[1.56808523, 0.65727391, 1.81021139, -0.17251653],
[-0.2550111, -0.08072427, -0.03202878, -0.17581665],
[1.51493992, 0.11805825, 1.629455, -1.31506612],
[-0.02765498, 0.44679743, 0.33192641, -0.27885413],
[0.05951614, -2.69652057, 1.28163262, 0.34703478]],
columns=['A', 'B', 'C', 'D'],
index=pd.date_range('2000-01-03', '2000-01-07'))
df['date'] = pd.Timestamp('19920106 18:21:32.12')
df.iloc[3, df.columns.get_loc('date')] = pd.Timestamp('20130101')
df['modified'] = df['date']
df.iloc[1, df.columns.get_loc('modified')] = pd.NaT
v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
df_unser = pd.read_json(v12_json)
assert_frame_equal(df, df_unser)
df_iso = df.drop(['modified'], axis=1)
v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
df_unser_iso = pd.read_json(v12_iso_json)
assert_frame_equal(df_iso, df_unser_iso)
def test_blocks_compat_GH9037(self):
index = pd.date_range('20000101', periods=10, freq='H')
df_mixed = DataFrame(OrderedDict(
float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
-0.60316077, 0.24653374, 0.28668979, -2.51969012,
0.95748401, -1.02970536],
int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
40314334, 21290235, 4991321, 41903419, 16008365],
str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474',
'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'],
float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685,
-0.48217572, 0.86229683, 1.08935819, 0.93898739,
-0.03030452, 1.43366348],
str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9',
'08e2f5c4', '07e1af03', 'addbd4a7', '1f6a09ba', '4bfc4d87'],
int_2=[86967717, 98098830, 51927505, 20372254, 12601730, 20884027,
34193846, 10561746, 24867120, 76131025]
), index=index)
# JSON deserialisation always creates unicode strings
df_mixed.columns = df_mixed.columns.astype('unicode')
df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
orient='split')
assert_frame_equal(df_mixed, df_roundtrip,
check_index_type=True,
check_column_type=True,
check_frame_type=True,
by_blocks=True,
check_exact=True)
def test_frame_nonprintable_bytes(self):
# GH14256: failing column caused segfaults, if it is not the last one
class BinaryThing(object):
def __init__(self, hexed):
self.hexed = hexed
if compat.PY2:
self.binary = hexed.decode('hex')
else:
self.binary = bytes.fromhex(hexed)
def __str__(self):
return self.hexed
hexed = '574b4454ba8c5eb4f98a8f45'
binthing = BinaryThing(hexed)
# verify the proper conversion of printable content
df_printable = DataFrame({'A': [binthing.hexed]})
assert df_printable.to_json() == \
'{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)
# check if non-printable content throws appropriate Exception
df_nonprintable = DataFrame({'A': [binthing]})
with pytest.raises(OverflowError):
df_nonprintable.to_json()
# the same with multiple columns threw segfaults
df_mixed = DataFrame({'A': [binthing], 'B': [1]},
columns=['A', 'B'])
with pytest.raises(OverflowError):
df_mixed.to_json()
# default_handler should resolve exceptions for non-string types
assert df_nonprintable.to_json(default_handler=str) == \
'{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)
assert df_mixed.to_json(default_handler=str) == \
'{{"A":{{"0":"{hex}"}},"B":{{"0":1}}}}'.format(hex=hexed)
def test_label_overflow(self):
# GH14256: buffer length not checked when writing label
df = pd.DataFrame({'bar' * 100000: [1], 'foo': [1337]})
assert df.to_json() == \
'{{"{bar}":{{"0":1}},"foo":{{"0":1337}}}}'.format(
bar=('bar' * 100000))
def test_series_non_unique_index(self):
s = Series(['a', 'b'], index=[1, 1])
pytest.raises(ValueError, s.to_json, orient='index')
assert_series_equal(s, read_json(s.to_json(orient='split'),
orient='split', typ='series'))
unser = read_json(s.to_json(orient='records'),
orient='records', typ='series')
tm.assert_numpy_array_equal(s.values, unser.values)
def test_series_from_json_to_json(self):
def _check_orient(series, orient, dtype=None, numpy=False,
check_index_type=True):
series = series.sort_index()
unser = read_json(series.to_json(orient=orient),
typ='series', orient=orient, numpy=numpy,
dtype=dtype)
unser = unser.sort_index()
if orient == "records" or orient == "values":
assert_almost_equal(series.values, unser.values)
else:
if orient == "split":
assert_series_equal(series, unser,
check_index_type=check_index_type)
else:
assert_series_equal(series, unser, check_names=False,
check_index_type=check_index_type)
def _check_all_orients(series, dtype=None, check_index_type=True):
_check_orient(series, "columns", dtype=dtype,
check_index_type=check_index_type)
_check_orient(series, "records", dtype=dtype,
check_index_type=check_index_type)
_check_orient(series, "split", dtype=dtype,
check_index_type=check_index_type)
_check_orient(series, "index", dtype=dtype,
check_index_type=check_index_type)
_check_orient(series, "values", dtype=dtype)
_check_orient(series, "columns", dtype=dtype, numpy=True,
check_index_type=check_index_type)
_check_orient(series, "records", dtype=dtype, numpy=True,
check_index_type=check_index_type)
_check_orient(series, "split", dtype=dtype, numpy=True,
check_index_type=check_index_type)
_check_orient(series, "index", dtype=dtype, numpy=True,
check_index_type=check_index_type)
_check_orient(series, "values", dtype=dtype, numpy=True,
check_index_type=check_index_type)
# basic
_check_all_orients(self.series)
assert self.series.to_json() == self.series.to_json(orient="index")
objSeries = Series([str(d) for d in self.objSeries],
index=self.objSeries.index,
name=self.objSeries.name)
_check_all_orients(objSeries, dtype=False)
# empty_series has empty index with object dtype
# which cannot be revert
assert self.empty_series.index.dtype == np.object_
_check_all_orients(self.empty_series, check_index_type=False)
_check_all_orients(self.ts)
# dtype
s = Series(lrange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])
_check_all_orients(Series(s, dtype=np.float64), dtype=np.float64)
_check_all_orients(Series(s, dtype=np.int), dtype=np.int)
def test_series_to_json_except(self):
s = Series([1, 2, 3])
pytest.raises(ValueError, s.to_json, orient="garbage")
def test_series_from_json_precise_float(self):
s = Series([4.56, 4.56, 4.56])
result = read_json(s.to_json(), typ='series', precise_float=True)
assert_series_equal(result, s, check_index_type=False)
def test_frame_from_json_precise_float(self):
df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
result = read_json(df.to_json(), precise_float=True)
assert_frame_equal(result, df, check_index_type=False,
check_column_type=False)
def test_typ(self):
s = Series(lrange(6), index=['a', 'b', 'c',
'd', 'e', 'f'], dtype='int64')
result = read_json(s.to_json(), typ=None)
assert_series_equal(result, s)
def test_reconstruction_index(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]])
result = read_json(df.to_json())
assert_frame_equal(result, df)
df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['A', 'B', 'C'])
result = read_json(df.to_json())
assert_frame_equal(result, df)
def test_path(self):
with ensure_clean('test.json') as path:
for df in [self.frame, self.frame2, self.intframe, self.tsframe,
self.mixed_frame]:
df.to_json(path)
read_json(path)
def test_axis_dates(self):
# frame
json = self.tsframe.to_json()
result = read_json(json)
assert_frame_equal(result, self.tsframe)
# series
json = self.ts.to_json()
result = read_json(json, typ='series')
assert_series_equal(result, self.ts, check_names=False)
assert result.name is None
def test_convert_dates(self):
# frame
df = self.tsframe.copy()
df['date'] = Timestamp('20130101')
json = df.to_json()
result = read_json(json)
assert_frame_equal(result, df)
df['foo'] = 1.
json = df.to_json(date_unit='ns')
result = read_json(json, convert_dates=False)
expected = df.copy()
expected['date'] = expected['date'].values.view('i8')
expected['foo'] = expected['foo'].astype('int64')
assert_frame_equal(result, expected)
# series
ts = Series(Timestamp('20130101'), index=self.ts.index)
json = ts.to_json()
result = read_json(json, typ='series')
assert_series_equal(result, ts)
def test_convert_dates_infer(self):
# GH10747
from pandas.io.json import dumps
infer_words = ['trade_time', 'date', 'datetime', 'sold_at',
'modified', 'timestamp', 'timestamps']
for infer_word in infer_words:
data = [{'id': 1, infer_word: 1036713600000}, {'id': 2}]
expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]],
columns=['id', infer_word])
result = read_json(dumps(data))[['id', infer_word]]
assert_frame_equal(result, expected)
def test_date_format_frame(self):
df = self.tsframe.copy()
def test_w_date(date, date_unit=None):
df['date'] = Timestamp(date)
df.iloc[1, df.columns.get_loc('date')] = pd.NaT
df.iloc[5, df.columns.get_loc('date')] = pd.NaT
if date_unit:
json = df.to_json(date_format='iso', date_unit=date_unit)
else:
json = df.to_json(date_format='iso')
result = read_json(json)
assert_frame_equal(result, df)
test_w_date('20130101 20:43:42.123')
test_w_date('20130101 20:43:42', date_unit='s')
test_w_date('20130101 20:43:42.123', date_unit='ms')
test_w_date('20130101 20:43:42.123456', date_unit='us')
test_w_date('20130101 20:43:42.123456789', date_unit='ns')
pytest.raises(ValueError, df.to_json, date_format='iso',
date_unit='foo')
def test_date_format_series(self):
def test_w_date(date, date_unit=None):
ts = Series(Timestamp(date), index=self.ts.index)
ts.iloc[1] = pd.NaT
ts.iloc[5] = pd.NaT
if date_unit:
json = ts.to_json(date_format='iso', date_unit=date_unit)
else:
json = ts.to_json(date_format='iso')
result = read_json(json, typ='series')
assert_series_equal(result, ts)
test_w_date('20130101 20:43:42.123')
test_w_date('20130101 20:43:42', date_unit='s')
test_w_date('20130101 20:43:42.123', date_unit='ms')
test_w_date('20130101 20:43:42.123456', date_unit='us')
test_w_date('20130101 20:43:42.123456789', date_unit='ns')
ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
pytest.raises(ValueError, ts.to_json, date_format='iso',
date_unit='foo')
def test_date_unit(self):
df = self.tsframe.copy()
df['date'] = Timestamp('20130101 20:43:42')
dl = df.columns.get_loc('date')
df.iloc[1, dl] = Timestamp('19710101 20:43:42')
df.iloc[2, dl] = Timestamp('21460101 20:43:42')
df.iloc[4, dl] = pd.NaT
for unit in ('s', 'ms', 'us', 'ns'):
json = df.to_json(date_format='epoch', date_unit=unit)
# force date unit
result = read_json(json, date_unit=unit)
assert_frame_equal(result, df)
# detect date unit
result = read_json(json, date_unit=None)
assert_frame_equal(result, df)
def test_weird_nested_json(self):
# this used to core dump the parser
s = r'''{
"status": "success",
"data": {
"posts": [
{
"id": 1,
"title": "A blog post",
"body": "Some useful content"
},
{
"id": 2,
"title": "Another blog post",
"body": "More content"
}
]
}
}'''
read_json(s)
def test_doc_example(self):
dfj2 = DataFrame(np.random.randn(5, 2), columns=list('AB'))
dfj2['date'] = Timestamp('20130101')
dfj2['ints'] = lrange(5)
dfj2['bools'] = True
dfj2.index = pd.date_range('20130101', periods=5)
json = dfj2.to_json()
result = read_json(json, dtype={'ints': np.int64, 'bools': np.bool_})
assert_frame_equal(result, result)
def test_misc_example(self):
# parsing unordered input fails
result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
error_msg = """DataFrame\\.index are different
DataFrame\\.index values are different \\(100\\.0 %\\)
\\[left\\]: Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
with tm.assert_raises_regex(AssertionError, error_msg):
assert_frame_equal(result, expected, check_index_type=False)
result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
@network
def test_round_trip_exception_(self):
# GH 3867
csv = 'https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv'
df = pd.read_csv(csv)
s = df.to_json()
result = pd.read_json(s)
assert_frame_equal(result.reindex(
index=df.index, columns=df.columns), df)
@network
def test_url(self):
url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5' # noqa
result = read_json(url, convert_dates=True)
for c in ['created_at', 'closed_at', 'updated_at']:
assert result[c].dtype == 'datetime64[ns]'
def test_timedelta(self):
converter = lambda x: pd.to_timedelta(x, unit='ms')
s = Series([timedelta(23), timedelta(seconds=5)])
assert s.dtype == 'timedelta64[ns]'
result = pd.read_json(s.to_json(), typ='series').apply(converter)
assert_series_equal(result, s)
s = Series([timedelta(23), timedelta(seconds=5)],
index=pd.Index([0, 1]))
assert s.dtype == 'timedelta64[ns]'
result = pd.read_json(s.to_json(), typ='series').apply(converter)
assert_series_equal(result, s)
frame = DataFrame([timedelta(23), timedelta(seconds=5)])
assert frame[0].dtype == 'timedelta64[ns]'
assert_frame_equal(frame, pd.read_json(frame.to_json())
.apply(converter))
frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
'b': [1, 2],
'c': pd.date_range(start='20130101', periods=2)})
result = pd.read_json(frame.to_json(date_unit='ns'))
result['a'] = pd.to_timedelta(result.a, unit='ns')
result['c'] = pd.to_datetime(result.c)
assert_frame_equal(frame, result)
def test_mixed_timedelta_datetime(self):
frame = DataFrame({'a': [timedelta(23), pd.Timestamp('20130101')]},
dtype=object)
expected = DataFrame({'a': [pd.Timedelta(frame.a[0]).value,
pd.Timestamp(frame.a[1]).value]})
result = pd.read_json(frame.to_json(date_unit='ns'),
dtype={'a': 'int64'})
assert_frame_equal(result, expected, check_index_type=False)
def test_default_handler(self):
value = object()
frame = DataFrame({'a': [7, value]})
expected = DataFrame({'a': [7, str(value)]})
result = pd.read_json(frame.to_json(default_handler=str))
assert_frame_equal(expected, result, check_index_type=False)
def test_default_handler_indirect(self):
from pandas.io.json import dumps
def default(obj):
if isinstance(obj, complex):
return [('mathjs', 'Complex'),
('re', obj.real),
('im', obj.imag)]
return str(obj)
df_list = [9, DataFrame({'a': [1, 'STR', complex(4, -5)],
'b': [float('nan'), None, 'N/A']},
columns=['a', 'b'])]
expected = ('[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
'["re",4.0],["im",-5.0]],"N\\/A"]]]')
assert dumps(df_list, default_handler=default,
orient="values") == expected
def test_default_handler_numpy_unsupported_dtype(self):
# GH12554 to_json raises 'Unhandled numpy dtype 15'
df = DataFrame({'a': [1, 2.3, complex(4, -5)],
'b': [float('nan'), None, complex(1.2, 0)]},
columns=['a', 'b'])
expected = ('[["(1+0j)","(nan+0j)"],'
'["(2.3+0j)","(nan+0j)"],'
'["(4-5j)","(1.2+0j)"]]')
assert df.to_json(default_handler=str, orient="values") == expected
def test_default_handler_raises(self):
def my_handler_raises(obj):
raise TypeError("raisin")
pytest.raises(TypeError,
DataFrame({'a': [1, 2, object()]}).to_json,
default_handler=my_handler_raises)
pytest.raises(TypeError,
DataFrame({'a': [1, 2, complex(4, -5)]}).to_json,
default_handler=my_handler_raises)
def test_categorical(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
df["B"] = df["A"]
expected = df.to_json()
df["B"] = df["A"].astype('category')
assert expected == df.to_json()
s = df["A"]
sc = df["B"]
assert s.to_json() == sc.to_json()
def test_datetime_tz(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
tz_range = pd.date_range('20130101', periods=3, tz='US/Eastern')
tz_naive = tz_range.tz_convert('utc').tz_localize(None)
df = DataFrame({
'A': tz_range,
'B': pd.date_range('20130101', periods=3)})
df_naive = df.copy()
df_naive['A'] = tz_naive
expected = df_naive.to_json()
assert expected == df.to_json()
stz = Series(tz_range)
s_naive = Series(tz_naive)
assert stz.to_json() == s_naive.to_json()
def test_sparse(self):
# GH4377 df.to_json segfaults with non-ndarray blocks
df = pd.DataFrame(np.random.randn(10, 4))
df.loc[:8] = np.nan
sdf = df.to_sparse()
expected = df.to_json()
assert expected == sdf.to_json()
s = pd.Series(np.random.randn(10))
s.loc[:8] = np.nan
ss = s.to_sparse()
expected = s.to_json()
assert expected == ss.to_json()
def test_tz_is_utc(self):
from pandas.io.json import dumps
exp = '"2013-01-10T05:00:00.000Z"'
ts = Timestamp('2013-01-10 05:00:00Z')
assert dumps(ts, iso_dates=True) == exp
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp
ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern')
assert dumps(ts, iso_dates=True) == exp
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp
ts = Timestamp('2013-01-10 00:00:00-0500')
assert dumps(ts, iso_dates=True) == exp
dt = ts.to_pydatetime()
assert dumps(dt, iso_dates=True) == exp
def test_tz_range_is_utc(self):
from pandas.io.json import dumps
exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
dfexp = ('{"DT":{'
'"0":"2013-01-01T05:00:00.000Z",'
'"1":"2013-01-02T05:00:00.000Z"}}')
tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2)
assert dumps(tz_range, iso_dates=True) == exp
dti = pd.DatetimeIndex(tz_range)
assert dumps(dti, iso_dates=True) == exp
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp
tz_range = pd.date_range('2013-01-01 00:00:00', periods=2,
tz='US/Eastern')
assert dumps(tz_range, iso_dates=True) == exp
dti = pd.DatetimeIndex(tz_range)
assert dumps(dti, iso_dates=True) == exp
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp
tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2)
assert dumps(tz_range, iso_dates=True) == exp
dti = pd.DatetimeIndex(tz_range)
assert dumps(dti, iso_dates=True) == exp
df = DataFrame({'DT': dti})
assert dumps(df, iso_dates=True) == dfexp
def test_read_inline_jsonl(self):
# GH9180
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
def test_read_s3_jsonl(self, s3_resource):
# GH17200
result = read_json('s3n://pandas-test/items.jsonl', lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
def test_read_local_jsonl(self):
# GH17200
with ensure_clean('tmp_items.json') as path:
with open(path, 'w') as infile:
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
result = read_json(path, lines=True)
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
assert_frame_equal(result, expected)
def test_read_jsonl_unicode_chars(self):
# GH15132: non-ascii unicode characters
# \u201d == RIGHT DOUBLE QUOTATION MARK
# simulate file handle
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
json = StringIO(json)
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)
# simulate string
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
result = read_json(json, lines=True)
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
columns=['a', 'b'])
assert_frame_equal(result, expected)
def test_read_json_large_numbers(self):
# GH18842
json = '{"articleId": "1404366058080022500245"}'
json = StringIO(json)
result = read_json(json, typ="series")
expected = Series(1.404366e+21, index=['articleId'])
assert_series_equal(result, expected)
json = '{"0": {"articleId": "1404366058080022500245"}}'
json = StringIO(json)
result = read_json(json)
expected = DataFrame(1.404366e+21, index=['articleId'], columns=[0])
assert_frame_equal(result, expected)
def test_to_jsonl(self):
# GH9180
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
assert result == expected
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
result = df.to_json(orient="records", lines=True)
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)
# GH15096: escaped characters in columns and data
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
columns=["a\\", 'b'])
result = df.to_json(orient="records", lines=True)
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
'{"a\\\\":"foo\\"","b":"bar"}')
assert result == expected
assert_frame_equal(pd.read_json(result, lines=True), df)
def test_latin_encoding(self):
if compat.PY2:
tm.assert_raises_regex(
TypeError, r'\[unicode\] is not implemented as a table column')
return
# GH 13774
pytest.skip("encoding not implemented in .to_json(), "
"xref #13774")
values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
[b'E\xc9, 17', b'a', b'b', b'c'],
[b'EE, 17', b'', b'a', b'b', b'c'],
[b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
[b'', b'a', b'b', b'c'],
[b'\xf8\xfc', b'a', b'b', b'c'],
[b'A\xf8\xfc', b'', b'a', b'b', b'c'],
[np.nan, b'', b'b', b'c'],
[b'A\xf8\xfc', np.nan, b'', b'b', b'c']]
def _try_decode(x, encoding='latin-1'):
try:
return x.decode(encoding)
except AttributeError:
return x
# not sure how to remove latin-1 from code in python 2 and 3
values = [[_try_decode(x) for x in y] for y in values]
examples = []
for dtype in ['category', object]:
for val in values:
examples.append(Series(val, dtype=dtype))
def roundtrip(s, encoding='latin-1'):
with ensure_clean('test.json') as path:
s.to_json(path, encoding=encoding)
retr = read_json(path, encoding=encoding)
assert_series_equal(s, retr, check_categorical=False)
for s in examples:
roundtrip(s)
def test_data_frame_size_after_to_json(self):
# GH15344
df = DataFrame({'a': [str(1)]})
size_before = df.memory_usage(index=True, deep=True).sum()
df.to_json()
size_after = df.memory_usage(index=True, deep=True).sum()
assert size_before == size_after
@pytest.mark.parametrize('data, expected', [
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b']),
{'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo'),
{'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
index=[['a', 'b'], ['c', 'd']]),
{'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
(Series([1, 2, 3], name='A'),
{'name': 'A', 'data': [1, 2, 3]}),
(Series([1, 2, 3], name='A').rename_axis('foo'),
{'name': 'A', 'data': [1, 2, 3]}),
(Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']]),
{'name': 'A', 'data': [1, 2]}),
])
def test_index_false_to_json_split(self, data, expected):
# GH 17394
# Testing index=False in to_json with orient='split'
result = data.to_json(orient='split', index=False)
result = json.loads(result)
assert result == expected
@pytest.mark.parametrize('data', [
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])),
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo')),
(DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
index=[['a', 'b'], ['c', 'd']])),
(Series([1, 2, 3], name='A')),
(Series([1, 2, 3], name='A').rename_axis('foo')),
(Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']])),
])
def test_index_false_to_json_table(self, data):
# GH 17394
# Testing index=False in to_json with orient='table'
result = data.to_json(orient='table', index=False)
result = json.loads(result)
expected = {
'schema': pd.io.json.build_table_schema(data, index=False),
'data': DataFrame(data).to_dict(orient='records')
}
assert result == expected
@pytest.mark.parametrize('orient', [
'records', 'index', 'columns', 'values'
])
def test_index_false_error_to_json(self, orient):
# GH 17394
# Testing error message from to_json with index=False
df = pd.DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])
with tm.assert_raises_regex(ValueError, "'index=False' is only "
"valid when 'orient' is "
"'split' or 'table'"):
df.to_json(orient=orient, index=False)