2500 lines
97 KiB
Python
2500 lines
97 KiB
Python
from warnings import catch_warnings
|
|
from itertools import combinations, product
|
|
|
|
import datetime as dt
|
|
import dateutil
|
|
import numpy as np
|
|
from numpy.random import randn
|
|
|
|
from datetime import datetime
|
|
from pandas.compat import StringIO, iteritems, PY2
|
|
import pandas as pd
|
|
from pandas import (DataFrame, concat,
|
|
read_csv, isna, Series, date_range,
|
|
Index, Panel, MultiIndex, Timestamp,
|
|
DatetimeIndex, Categorical)
|
|
from pandas.core.dtypes.dtypes import CategoricalDtype
|
|
from pandas.util import testing as tm
|
|
from pandas.util.testing import (assert_frame_equal,
|
|
makeCustomDataframe as mkdf)
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(params=[True, False])
|
|
def sort(request):
|
|
"""Boolean sort keyword for concat and DataFrame.append."""
|
|
return request.param
|
|
|
|
|
|
@pytest.fixture(params=[True, False, None])
|
|
def sort_with_none(request):
|
|
"""Boolean sort keyword for concat and DataFrame.append.
|
|
|
|
Includes the default of None
|
|
"""
|
|
# TODO: Replace with sort once keyword changes.
|
|
return request.param
|
|
|
|
|
|
class ConcatenateBase(object):
|
|
|
|
def setup_method(self, method):
|
|
self.frame = DataFrame(tm.getSeriesData())
|
|
self.mixed_frame = self.frame.copy()
|
|
self.mixed_frame['foo'] = 'bar'
|
|
|
|
|
|
class TestConcatAppendCommon(ConcatenateBase):
|
|
|
|
"""
|
|
Test common dtype coercion rules between concat and append.
|
|
"""
|
|
|
|
def setup_method(self, method):
|
|
|
|
dt_data = [pd.Timestamp('2011-01-01'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-03')]
|
|
tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-03', tz='US/Eastern')]
|
|
|
|
td_data = [pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('3 days')]
|
|
|
|
period_data = [pd.Period('2011-01', freq='M'),
|
|
pd.Period('2011-02', freq='M'),
|
|
pd.Period('2011-03', freq='M')]
|
|
|
|
self.data = {'bool': [True, False, True],
|
|
'int64': [1, 2, 3],
|
|
'float64': [1.1, np.nan, 3.3],
|
|
'category': pd.Categorical(['X', 'Y', 'Z']),
|
|
'object': ['a', 'b', 'c'],
|
|
'datetime64[ns]': dt_data,
|
|
'datetime64[ns, US/Eastern]': tz_data,
|
|
'timedelta64[ns]': td_data,
|
|
'period[M]': period_data}
|
|
|
|
def _check_expected_dtype(self, obj, label):
|
|
"""
|
|
Check whether obj has expected dtype depending on label
|
|
considering not-supported dtypes
|
|
"""
|
|
if isinstance(obj, pd.Index):
|
|
if label == 'bool':
|
|
assert obj.dtype == 'object'
|
|
else:
|
|
assert obj.dtype == label
|
|
elif isinstance(obj, pd.Series):
|
|
if label.startswith('period'):
|
|
assert obj.dtype == 'object'
|
|
else:
|
|
assert obj.dtype == label
|
|
else:
|
|
raise ValueError
|
|
|
|
def test_dtypes(self):
|
|
# to confirm test case covers intended dtypes
|
|
for typ, vals in iteritems(self.data):
|
|
self._check_expected_dtype(pd.Index(vals), typ)
|
|
self._check_expected_dtype(pd.Series(vals), typ)
|
|
|
|
def test_concatlike_same_dtypes(self):
|
|
# GH 13660
|
|
for typ1, vals1 in iteritems(self.data):
|
|
|
|
vals2 = vals1
|
|
vals3 = vals1
|
|
|
|
if typ1 == 'category':
|
|
exp_data = pd.Categorical(list(vals1) + list(vals2))
|
|
exp_data3 = pd.Categorical(list(vals1) + list(vals2) +
|
|
list(vals3))
|
|
else:
|
|
exp_data = vals1 + vals2
|
|
exp_data3 = vals1 + vals2 + vals3
|
|
|
|
# ----- Index ----- #
|
|
|
|
# index.append
|
|
res = pd.Index(vals1).append(pd.Index(vals2))
|
|
exp = pd.Index(exp_data)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# 3 elements
|
|
res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)])
|
|
exp = pd.Index(exp_data3)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# index.append name mismatch
|
|
i1 = pd.Index(vals1, name='x')
|
|
i2 = pd.Index(vals2, name='y')
|
|
res = i1.append(i2)
|
|
exp = pd.Index(exp_data)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# index.append name match
|
|
i1 = pd.Index(vals1, name='x')
|
|
i2 = pd.Index(vals2, name='x')
|
|
res = i1.append(i2)
|
|
exp = pd.Index(exp_data, name='x')
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# cannot append non-index
|
|
with tm.assert_raises_regex(TypeError,
|
|
'all inputs must be Index'):
|
|
pd.Index(vals1).append(vals2)
|
|
|
|
with tm.assert_raises_regex(TypeError,
|
|
'all inputs must be Index'):
|
|
pd.Index(vals1).append([pd.Index(vals2), vals3])
|
|
|
|
# ----- Series ----- #
|
|
|
|
# series.append
|
|
res = pd.Series(vals1).append(pd.Series(vals2),
|
|
ignore_index=True)
|
|
exp = pd.Series(exp_data)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# concat
|
|
res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
|
|
ignore_index=True)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# 3 elements
|
|
res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)],
|
|
ignore_index=True)
|
|
exp = pd.Series(exp_data3)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = pd.concat([pd.Series(vals1), pd.Series(vals2),
|
|
pd.Series(vals3)], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
# name mismatch
|
|
s1 = pd.Series(vals1, name='x')
|
|
s2 = pd.Series(vals2, name='y')
|
|
res = s1.append(s2, ignore_index=True)
|
|
exp = pd.Series(exp_data)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
res = pd.concat([s1, s2], ignore_index=True)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# name match
|
|
s1 = pd.Series(vals1, name='x')
|
|
s2 = pd.Series(vals2, name='x')
|
|
res = s1.append(s2, ignore_index=True)
|
|
exp = pd.Series(exp_data, name='x')
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
res = pd.concat([s1, s2], ignore_index=True)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# cannot append non-index
|
|
msg = (r'cannot concatenate object of type \"(.+?)\";'
|
|
' only pd.Series, pd.DataFrame, and pd.Panel'
|
|
r' \(deprecated\) objs are valid')
|
|
with tm.assert_raises_regex(TypeError, msg):
|
|
pd.Series(vals1).append(vals2)
|
|
|
|
with tm.assert_raises_regex(TypeError, msg):
|
|
pd.Series(vals1).append([pd.Series(vals2), vals3])
|
|
|
|
with tm.assert_raises_regex(TypeError, msg):
|
|
pd.concat([pd.Series(vals1), vals2])
|
|
|
|
with tm.assert_raises_regex(TypeError, msg):
|
|
pd.concat([pd.Series(vals1), pd.Series(vals2), vals3])
|
|
|
|
def test_concatlike_dtypes_coercion(self):
|
|
# GH 13660
|
|
for typ1, vals1 in iteritems(self.data):
|
|
for typ2, vals2 in iteritems(self.data):
|
|
|
|
vals3 = vals2
|
|
|
|
# basically infer
|
|
exp_index_dtype = None
|
|
exp_series_dtype = None
|
|
|
|
if typ1 == typ2:
|
|
# same dtype is tested in test_concatlike_same_dtypes
|
|
continue
|
|
elif typ1 == 'category' or typ2 == 'category':
|
|
# ToDo: suspicious
|
|
continue
|
|
|
|
# specify expected dtype
|
|
if typ1 == 'bool' and typ2 in ('int64', 'float64'):
|
|
# series coerces to numeric based on numpy rule
|
|
# index doesn't because bool is object dtype
|
|
exp_series_dtype = typ2
|
|
elif typ2 == 'bool' and typ1 in ('int64', 'float64'):
|
|
exp_series_dtype = typ1
|
|
elif (typ1 == 'datetime64[ns, US/Eastern]' or
|
|
typ2 == 'datetime64[ns, US/Eastern]' or
|
|
typ1 == 'timedelta64[ns]' or
|
|
typ2 == 'timedelta64[ns]'):
|
|
exp_index_dtype = object
|
|
exp_series_dtype = object
|
|
|
|
exp_data = vals1 + vals2
|
|
exp_data3 = vals1 + vals2 + vals3
|
|
|
|
# ----- Index ----- #
|
|
|
|
# index.append
|
|
res = pd.Index(vals1).append(pd.Index(vals2))
|
|
exp = pd.Index(exp_data, dtype=exp_index_dtype)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# 3 elements
|
|
res = pd.Index(vals1).append([pd.Index(vals2),
|
|
pd.Index(vals3)])
|
|
exp = pd.Index(exp_data3, dtype=exp_index_dtype)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
# ----- Series ----- #
|
|
|
|
# series.append
|
|
res = pd.Series(vals1).append(pd.Series(vals2),
|
|
ignore_index=True)
|
|
exp = pd.Series(exp_data, dtype=exp_series_dtype)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# concat
|
|
res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
|
|
ignore_index=True)
|
|
tm.assert_series_equal(res, exp, check_index_type=True)
|
|
|
|
# 3 elements
|
|
res = pd.Series(vals1).append([pd.Series(vals2),
|
|
pd.Series(vals3)],
|
|
ignore_index=True)
|
|
exp = pd.Series(exp_data3, dtype=exp_series_dtype)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = pd.concat([pd.Series(vals1), pd.Series(vals2),
|
|
pd.Series(vals3)], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
def test_concatlike_common_coerce_to_pandas_object(self):
|
|
# GH 13626
|
|
# result must be Timestamp/Timedelta, not datetime.datetime/timedelta
|
|
dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'])
|
|
tdi = pd.TimedeltaIndex(['1 days', '2 days'])
|
|
|
|
exp = pd.Index([pd.Timestamp('2011-01-01'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days')])
|
|
|
|
res = dti.append(tdi)
|
|
tm.assert_index_equal(res, exp)
|
|
assert isinstance(res[0], pd.Timestamp)
|
|
assert isinstance(res[-1], pd.Timedelta)
|
|
|
|
dts = pd.Series(dti)
|
|
tds = pd.Series(tdi)
|
|
res = dts.append(tds)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
assert isinstance(res.iloc[0], pd.Timestamp)
|
|
assert isinstance(res.iloc[-1], pd.Timedelta)
|
|
|
|
res = pd.concat([dts, tds])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
assert isinstance(res.iloc[0], pd.Timestamp)
|
|
assert isinstance(res.iloc[-1], pd.Timedelta)
|
|
|
|
def test_concatlike_datetimetz(self, tz_aware_fixture):
|
|
tz = tz_aware_fixture
|
|
# GH 7795
|
|
dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
|
|
dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz)
|
|
|
|
exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
|
|
'2012-01-01', '2012-01-02'], tz=tz)
|
|
|
|
res = dti1.append(dti2)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
dts1 = pd.Series(dti1)
|
|
dts2 = pd.Series(dti2)
|
|
res = dts1.append(dts2)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([dts1, dts2])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
@pytest.mark.parametrize('tz',
|
|
['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT'])
|
|
def test_concatlike_datetimetz_short(self, tz):
|
|
# GH 7795
|
|
ix1 = pd.DatetimeIndex(start='2014-07-15', end='2014-07-17',
|
|
freq='D', tz=tz)
|
|
ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz)
|
|
df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B'])
|
|
df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B'])
|
|
|
|
exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16',
|
|
'2014-07-17', '2014-07-11',
|
|
'2014-07-21'], tz=tz)
|
|
exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B'])
|
|
|
|
tm.assert_frame_equal(df1.append(df2), exp)
|
|
tm.assert_frame_equal(pd.concat([df1, df2]), exp)
|
|
|
|
def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
|
|
tz = tz_aware_fixture
|
|
# GH 13660
|
|
|
|
# different tz coerces to object
|
|
dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
|
|
dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'])
|
|
|
|
exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
|
|
pd.Timestamp('2011-01-02', tz=tz),
|
|
pd.Timestamp('2012-01-01'),
|
|
pd.Timestamp('2012-01-02')], dtype=object)
|
|
|
|
res = dti1.append(dti2)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
dts1 = pd.Series(dti1)
|
|
dts2 = pd.Series(dti2)
|
|
res = dts1.append(dts2)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([dts1, dts2])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
# different tz
|
|
dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'],
|
|
tz='US/Pacific')
|
|
|
|
exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
|
|
pd.Timestamp('2011-01-02', tz=tz),
|
|
pd.Timestamp('2012-01-01', tz='US/Pacific'),
|
|
pd.Timestamp('2012-01-02', tz='US/Pacific')],
|
|
dtype=object)
|
|
|
|
res = dti1.append(dti3)
|
|
# tm.assert_index_equal(res, exp)
|
|
|
|
dts1 = pd.Series(dti1)
|
|
dts3 = pd.Series(dti3)
|
|
res = dts1.append(dts3)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([dts1, dts3])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
def test_concatlike_common_period(self):
|
|
# GH 13660
|
|
pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
|
|
pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M')
|
|
|
|
exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01',
|
|
'2012-02'], freq='M')
|
|
|
|
res = pi1.append(pi2)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
ps1 = pd.Series(pi1)
|
|
ps2 = pd.Series(pi2)
|
|
res = ps1.append(ps2)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([ps1, ps2])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
def test_concatlike_common_period_diff_freq_to_object(self):
|
|
# GH 13221
|
|
pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
|
|
pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D')
|
|
|
|
exp = pd.Index([pd.Period('2011-01', freq='M'),
|
|
pd.Period('2011-02', freq='M'),
|
|
pd.Period('2012-01-01', freq='D'),
|
|
pd.Period('2012-02-01', freq='D')], dtype=object)
|
|
|
|
res = pi1.append(pi2)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
ps1 = pd.Series(pi1)
|
|
ps2 = pd.Series(pi2)
|
|
res = ps1.append(ps2)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([ps1, ps2])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
def test_concatlike_common_period_mixed_dt_to_object(self):
|
|
# GH 13221
|
|
# different datetimelike
|
|
pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
|
|
tdi = pd.TimedeltaIndex(['1 days', '2 days'])
|
|
exp = pd.Index([pd.Period('2011-01', freq='M'),
|
|
pd.Period('2011-02', freq='M'),
|
|
pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days')], dtype=object)
|
|
|
|
res = pi1.append(tdi)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
ps1 = pd.Series(pi1)
|
|
tds = pd.Series(tdi)
|
|
res = ps1.append(tds)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([ps1, tds])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
# inverse
|
|
exp = pd.Index([pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Period('2011-01', freq='M'),
|
|
pd.Period('2011-02', freq='M')], dtype=object)
|
|
|
|
res = tdi.append(pi1)
|
|
tm.assert_index_equal(res, exp)
|
|
|
|
ps1 = pd.Series(pi1)
|
|
tds = pd.Series(tdi)
|
|
res = tds.append(ps1)
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
res = pd.concat([tds, ps1])
|
|
tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
|
|
|
|
def test_concat_categorical(self):
|
|
# GH 13524
|
|
|
|
# same categories -> category
|
|
s1 = pd.Series([1, 2, np.nan], dtype='category')
|
|
s2 = pd.Series([2, 1, 2], dtype='category')
|
|
|
|
exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category')
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
# partially different categories => not-category
|
|
s1 = pd.Series([3, 2], dtype='category')
|
|
s2 = pd.Series([2, 1], dtype='category')
|
|
|
|
exp = pd.Series([3, 2, 2, 1])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
# completely different categories (same dtype) => not-category
|
|
s1 = pd.Series([10, 11, np.nan], dtype='category')
|
|
s2 = pd.Series([np.nan, 1, 3, 2], dtype='category')
|
|
|
|
exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
def test_union_categorical_same_categories_different_order(self):
|
|
# https://github.com/pandas-dev/pandas/issues/19096
|
|
a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']))
|
|
b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']))
|
|
result = pd.concat([a, b], ignore_index=True)
|
|
expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
|
|
categories=['a', 'b', 'c']))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_concat_categorical_coercion(self):
|
|
# GH 13524
|
|
|
|
# category + not-category => not-category
|
|
s1 = pd.Series([1, 2, np.nan], dtype='category')
|
|
s2 = pd.Series([2, 1, 2])
|
|
|
|
exp = pd.Series([1, 2, np.nan, 2, 1, 2])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
# result shouldn't be affected by 1st elem dtype
|
|
exp = pd.Series([2, 1, 2, 1, 2, np.nan])
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
# all values are not in category => not-category
|
|
s1 = pd.Series([3, 2], dtype='category')
|
|
s2 = pd.Series([2, 1])
|
|
|
|
exp = pd.Series([3, 2, 2, 1])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
exp = pd.Series([2, 1, 3, 2])
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
# completely different categories => not-category
|
|
s1 = pd.Series([10, 11, np.nan], dtype='category')
|
|
s2 = pd.Series([1, 3, 2])
|
|
|
|
exp = pd.Series([10, 11, np.nan, 1, 3, 2])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
exp = pd.Series([1, 3, 2, 10, 11, np.nan])
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
# different dtype => not-category
|
|
s1 = pd.Series([10, 11, np.nan], dtype='category')
|
|
s2 = pd.Series(['a', 'b', 'c'])
|
|
|
|
exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c'])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan])
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
# if normal series only contains NaN-likes => not-category
|
|
s1 = pd.Series([10, 11], dtype='category')
|
|
s2 = pd.Series([np.nan, np.nan, np.nan])
|
|
|
|
exp = pd.Series([10, 11, np.nan, np.nan, np.nan])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
exp = pd.Series([np.nan, np.nan, np.nan, 10, 11])
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
def test_concat_categorical_3elem_coercion(self):
|
|
# GH 13524
|
|
|
|
# mixed dtypes => not-category
|
|
s1 = pd.Series([1, 2, np.nan], dtype='category')
|
|
s2 = pd.Series([2, 1, 2], dtype='category')
|
|
s3 = pd.Series([1, 2, 1, 2, np.nan])
|
|
|
|
exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan])
|
|
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
|
|
|
|
exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2])
|
|
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
|
|
|
|
# values are all in either category => not-category
|
|
s1 = pd.Series([4, 5, 6], dtype='category')
|
|
s2 = pd.Series([1, 2, 3], dtype='category')
|
|
s3 = pd.Series([1, 3, 4])
|
|
|
|
exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
|
|
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
|
|
|
|
exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
|
|
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
|
|
|
|
# values are all in either category => not-category
|
|
s1 = pd.Series([4, 5, 6], dtype='category')
|
|
s2 = pd.Series([1, 2, 3], dtype='category')
|
|
s3 = pd.Series([10, 11, 12])
|
|
|
|
exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
|
|
tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)
|
|
|
|
exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
|
|
tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
|
|
|
|
def test_concat_categorical_multi_coercion(self):
|
|
# GH 13524
|
|
|
|
s1 = pd.Series([1, 3], dtype='category')
|
|
s2 = pd.Series([3, 4], dtype='category')
|
|
s3 = pd.Series([2, 3])
|
|
s4 = pd.Series([2, 2], dtype='category')
|
|
s5 = pd.Series([1, np.nan])
|
|
s6 = pd.Series([1, 3, 2], dtype='category')
|
|
|
|
# mixed dtype, values are all in categories => not-category
|
|
exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
|
|
res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
res = s1.append([s2, s3, s4, s5, s6], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
|
|
res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
res = s6.append([s5, s4, s3, s2, s1], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
def test_concat_categorical_ordered(self):
|
|
# GH 13524
|
|
|
|
s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True))
|
|
s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True))
|
|
|
|
exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan],
|
|
ordered=True))
|
|
tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp)
|
|
|
|
def test_concat_categorical_coercion_nan(self):
|
|
# GH 13524
|
|
|
|
# some edge cases
|
|
# category + not-category => not category
|
|
s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64),
|
|
dtype='category')
|
|
s2 = pd.Series([np.nan, 1])
|
|
|
|
exp = pd.Series([np.nan, np.nan, np.nan, 1])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
s1 = pd.Series([1, np.nan], dtype='category')
|
|
s2 = pd.Series([np.nan, np.nan])
|
|
|
|
exp = pd.Series([1, np.nan, np.nan, np.nan])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
# mixed dtype, all nan-likes => not-category
|
|
s1 = pd.Series([np.nan, np.nan], dtype='category')
|
|
s2 = pd.Series([np.nan, np.nan])
|
|
|
|
exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
# all category nan-likes => category
|
|
s1 = pd.Series([np.nan, np.nan], dtype='category')
|
|
s2 = pd.Series([np.nan, np.nan], dtype='category')
|
|
|
|
exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category')
|
|
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
def test_concat_categorical_empty(self):
|
|
# GH 13524
|
|
|
|
s1 = pd.Series([], dtype='category')
|
|
s2 = pd.Series([1, 2], dtype='category')
|
|
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
|
|
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
|
|
|
|
s1 = pd.Series([], dtype='category')
|
|
s2 = pd.Series([], dtype='category')
|
|
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
|
|
|
|
s1 = pd.Series([], dtype='category')
|
|
s2 = pd.Series([], dtype='object')
|
|
|
|
# different dtype => not-category
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)
|
|
|
|
s1 = pd.Series([], dtype='category')
|
|
s2 = pd.Series([np.nan, np.nan])
|
|
|
|
# empty Series is ignored
|
|
exp = pd.Series([np.nan, np.nan])
|
|
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
|
|
tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
|
|
|
|
tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
|
|
tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
|
|
|
|
|
|
class TestAppend(ConcatenateBase):
|
|
|
|
def test_append(self, sort):
|
|
begin_index = self.frame.index[:5]
|
|
end_index = self.frame.index[5:]
|
|
|
|
begin_frame = self.frame.reindex(begin_index)
|
|
end_frame = self.frame.reindex(end_index)
|
|
|
|
appended = begin_frame.append(end_frame)
|
|
tm.assert_almost_equal(appended['A'], self.frame['A'])
|
|
|
|
del end_frame['A']
|
|
partial_appended = begin_frame.append(end_frame, sort=sort)
|
|
assert 'A' in partial_appended
|
|
|
|
partial_appended = end_frame.append(begin_frame, sort=sort)
|
|
assert 'A' in partial_appended
|
|
|
|
# mixed type handling
|
|
appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
|
|
tm.assert_frame_equal(appended, self.mixed_frame)
|
|
|
|
# what to test here
|
|
mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort)
|
|
mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:],
|
|
sort=sort)
|
|
|
|
# all equal except 'foo' column
|
|
tm.assert_frame_equal(
|
|
mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
|
|
mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))
|
|
|
|
# append empty
|
|
empty = DataFrame({})
|
|
|
|
appended = self.frame.append(empty)
|
|
tm.assert_frame_equal(self.frame, appended)
|
|
assert appended is not self.frame
|
|
|
|
appended = empty.append(self.frame)
|
|
tm.assert_frame_equal(self.frame, appended)
|
|
assert appended is not self.frame
|
|
|
|
# Overlap
|
|
with pytest.raises(ValueError):
|
|
self.frame.append(self.frame, verify_integrity=True)
|
|
|
|
# see gh-6129: new columns
|
|
df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
|
|
row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
|
|
expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
|
|
'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
|
|
result = df.append(row)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_length0_frame(self, sort):
|
|
df = DataFrame(columns=['A', 'B', 'C'])
|
|
df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
|
|
df5 = df.append(df3, sort=sort)
|
|
|
|
expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
|
|
assert_frame_equal(df5, expected)
|
|
|
|
def test_append_records(self):
|
|
arr1 = np.zeros((2,), dtype=('i4,f4,a10'))
|
|
arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")]
|
|
|
|
arr2 = np.zeros((3,), dtype=('i4,f4,a10'))
|
|
arr2[:] = [(3, 4., 'foo'),
|
|
(5, 6., "bar"),
|
|
(7., 8., 'baz')]
|
|
|
|
df1 = DataFrame(arr1)
|
|
df2 = DataFrame(arr2)
|
|
|
|
result = df1.append(df2, ignore_index=True)
|
|
expected = DataFrame(np.concatenate((arr1, arr2)))
|
|
assert_frame_equal(result, expected)
|
|
|
|
# rewrite sort fixture, since we also want to test default of None
|
|
def test_append_sorts(self, sort_with_none):
|
|
df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
|
|
df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3])
|
|
|
|
if sort_with_none is None:
|
|
# only warn if not explicitly specified
|
|
# don't check stacklevel since its set for concat, and append
|
|
# has an extra stack.
|
|
ctx = tm.assert_produces_warning(FutureWarning,
|
|
check_stacklevel=False)
|
|
else:
|
|
ctx = tm.assert_produces_warning(None)
|
|
|
|
with ctx:
|
|
result = df1.append(df2, sort=sort_with_none)
|
|
|
|
# for None / True
|
|
expected = pd.DataFrame({"b": [1, 2, None, None],
|
|
"a": [1, 2, 1, 2],
|
|
"c": [None, None, 3, 4]},
|
|
columns=['a', 'b', 'c'])
|
|
if sort_with_none is False:
|
|
expected = expected[['b', 'a', 'c']]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_append_different_columns(self, sort):
|
|
df = DataFrame({'bools': np.random.randn(10) > 0,
|
|
'ints': np.random.randint(0, 10, 10),
|
|
'floats': np.random.randn(10),
|
|
'strings': ['foo', 'bar'] * 5})
|
|
|
|
a = df[:5].loc[:, ['bools', 'ints', 'floats']]
|
|
b = df[5:].loc[:, ['strings', 'ints', 'floats']]
|
|
|
|
appended = a.append(b, sort=sort)
|
|
assert isna(appended['strings'][0:4]).all()
|
|
assert isna(appended['bools'][5:]).all()
|
|
|
|
def test_append_many(self, sort):
|
|
chunks = [self.frame[:5], self.frame[5:10],
|
|
self.frame[10:15], self.frame[15:]]
|
|
|
|
result = chunks[0].append(chunks[1:])
|
|
tm.assert_frame_equal(result, self.frame)
|
|
|
|
chunks[-1] = chunks[-1].copy()
|
|
chunks[-1]['foo'] = 'bar'
|
|
result = chunks[0].append(chunks[1:], sort=sort)
|
|
tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame)
|
|
assert (result['foo'][15:] == 'bar').all()
|
|
assert result['foo'][:15].isna().all()
|
|
|
|
def test_append_preserve_index_name(self):
|
|
# #980
|
|
df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
|
|
df1 = df1.set_index(['A'])
|
|
df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
|
|
columns=['A', 'B', 'C'])
|
|
df2 = df2.set_index(['A'])
|
|
|
|
result = df1.append(df2)
|
|
assert result.index.name == 'A'
|
|
|
|
indexes_can_append = [
|
|
pd.RangeIndex(3),
|
|
pd.Index([4, 5, 6]),
|
|
pd.Index([4.5, 5.5, 6.5]),
|
|
pd.Index(list('abc')),
|
|
pd.CategoricalIndex('A B C'.split()),
|
|
pd.CategoricalIndex('D E F'.split(), ordered=True),
|
|
pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
|
|
dt.datetime(2013, 1, 3, 6, 10),
|
|
dt.datetime(2013, 1, 3, 7, 12)]),
|
|
]
|
|
|
|
indexes_cannot_append_with_other = [
|
|
pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
|
|
pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
|
|
]
|
|
|
|
all_indexes = indexes_can_append + indexes_cannot_append_with_other
|
|
|
|
@pytest.mark.parametrize("index",
|
|
all_indexes,
|
|
ids=lambda x: x.__class__.__name__)
|
|
def test_append_same_columns_type(self, index):
|
|
# GH18359
|
|
|
|
# df wider than ser
|
|
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
|
|
ser_index = index[:2]
|
|
ser = pd.Series([7, 8], index=ser_index, name=2)
|
|
result = df.append(ser)
|
|
expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
|
|
index=[0, 1, 2],
|
|
columns=index)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# ser wider than df
|
|
ser_index = index
|
|
index = index[:2]
|
|
df = pd.DataFrame([[1, 2], [4, 5]], columns=index)
|
|
ser = pd.Series([7, 8, 9], index=ser_index, name=2)
|
|
result = df.append(ser)
|
|
expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
|
|
index=[0, 1, 2],
|
|
columns=ser_index)
|
|
assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("df_columns, series_index",
|
|
combinations(indexes_can_append, r=2),
|
|
ids=lambda x: x.__class__.__name__)
|
|
def test_append_different_columns_types(self, df_columns, series_index):
|
|
# GH18359
|
|
# See also test 'test_append_different_columns_types_raises' below
|
|
# for errors raised when appending
|
|
|
|
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
|
|
ser = pd.Series([7, 8, 9], index=series_index, name=2)
|
|
|
|
result = df.append(ser)
|
|
idx_diff = ser.index.difference(df_columns)
|
|
combined_columns = Index(df_columns.tolist()).append(idx_diff)
|
|
expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
|
|
[4, 5, 6, np.nan, np.nan, np.nan],
|
|
[np.nan, np.nan, np.nan, 7, 8, 9]],
|
|
index=[0, 1, 2],
|
|
columns=combined_columns)
|
|
assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize(
|
|
"index_can_append, index_cannot_append_with_other",
|
|
product(indexes_can_append, indexes_cannot_append_with_other),
|
|
ids=lambda x: x.__class__.__name__)
|
|
def test_append_different_columns_types_raises(
|
|
self, index_can_append, index_cannot_append_with_other):
|
|
# GH18359
|
|
# Dataframe.append will raise if IntervalIndex/MultiIndex appends
|
|
# or is appended to a different index type
|
|
#
|
|
# See also test 'test_append_different_columns_types' above for
|
|
# appending without raising.
|
|
|
|
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
|
|
ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
|
|
name=2)
|
|
with pytest.raises(TypeError):
|
|
df.append(ser)
|
|
|
|
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
|
|
columns=index_cannot_append_with_other)
|
|
ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
|
|
with pytest.raises(TypeError):
|
|
df.append(ser)
|
|
|
|
def test_append_dtype_coerce(self, sort):
|
|
|
|
# GH 4993
|
|
# appending with datetime will incorrectly convert datetime64
|
|
|
|
df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
|
|
dt.datetime(2013, 1, 2, 0, 0)],
|
|
columns=['start_time'])
|
|
df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
|
|
dt.datetime(2013, 1, 3, 6, 10)],
|
|
[dt.datetime(2013, 1, 4, 0, 0),
|
|
dt.datetime(2013, 1, 4, 7, 10)]],
|
|
columns=['start_time', 'end_time'])
|
|
|
|
expected = concat([Series([pd.NaT,
|
|
pd.NaT,
|
|
dt.datetime(2013, 1, 3, 6, 10),
|
|
dt.datetime(2013, 1, 4, 7, 10)],
|
|
name='end_time'),
|
|
Series([dt.datetime(2013, 1, 1, 0, 0),
|
|
dt.datetime(2013, 1, 2, 0, 0),
|
|
dt.datetime(2013, 1, 3, 0, 0),
|
|
dt.datetime(2013, 1, 4, 0, 0)],
|
|
name='start_time')],
|
|
axis=1, sort=sort)
|
|
result = df1.append(df2, ignore_index=True, sort=sort)
|
|
if sort:
|
|
expected = expected[['end_time', 'start_time']]
|
|
else:
|
|
expected = expected[['start_time', 'end_time']]
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_append_missing_column_proper_upcast(self, sort):
|
|
df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')})
|
|
df2 = DataFrame({'B': np.array([True, False, True, False],
|
|
dtype=bool)})
|
|
|
|
appended = df1.append(df2, ignore_index=True, sort=sort)
|
|
assert appended['A'].dtype == 'f8'
|
|
assert appended['B'].dtype == 'O'
|
|
|
|
|
|
class TestConcatenate(ConcatenateBase):
|
|
|
|
def test_concat_copy(self):
|
|
df = DataFrame(np.random.randn(4, 3))
|
|
df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
|
|
df3 = DataFrame({5: 'foo'}, index=range(4))
|
|
|
|
# These are actual copies.
|
|
result = concat([df, df2, df3], axis=1, copy=True)
|
|
|
|
for b in result._data.blocks:
|
|
assert b.values.base is None
|
|
|
|
# These are the same.
|
|
result = concat([df, df2, df3], axis=1, copy=False)
|
|
|
|
for b in result._data.blocks:
|
|
if b.is_float:
|
|
assert b.values.base is df._data.blocks[0].values.base
|
|
elif b.is_integer:
|
|
assert b.values.base is df2._data.blocks[0].values.base
|
|
elif b.is_object:
|
|
assert b.values.base is not None
|
|
|
|
# Float block was consolidated.
|
|
df4 = DataFrame(np.random.randn(4, 1))
|
|
result = concat([df, df2, df3, df4], axis=1, copy=False)
|
|
for b in result._data.blocks:
|
|
if b.is_float:
|
|
assert b.values.base is None
|
|
elif b.is_integer:
|
|
assert b.values.base is df2._data.blocks[0].values.base
|
|
elif b.is_object:
|
|
assert b.values.base is not None
|
|
|
|
def test_concat_with_group_keys(self):
|
|
df = DataFrame(np.random.randn(4, 3))
|
|
df2 = DataFrame(np.random.randn(4, 4))
|
|
|
|
# axis=0
|
|
df = DataFrame(np.random.randn(3, 4))
|
|
df2 = DataFrame(np.random.randn(4, 4))
|
|
|
|
result = concat([df, df2], keys=[0, 1])
|
|
exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
|
|
[0, 1, 2, 0, 1, 2, 3]])
|
|
expected = DataFrame(np.r_[df.values, df2.values],
|
|
index=exp_index)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat([df, df], keys=[0, 1])
|
|
exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
|
|
[0, 1, 2, 0, 1, 2]])
|
|
expected = DataFrame(np.r_[df.values, df.values],
|
|
index=exp_index2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# axis=1
|
|
df = DataFrame(np.random.randn(4, 3))
|
|
df2 = DataFrame(np.random.randn(4, 4))
|
|
|
|
result = concat([df, df2], keys=[0, 1], axis=1)
|
|
expected = DataFrame(np.c_[df.values, df2.values],
|
|
columns=exp_index)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat([df, df], keys=[0, 1], axis=1)
|
|
expected = DataFrame(np.c_[df.values, df.values],
|
|
columns=exp_index2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_keys_specific_levels(self):
|
|
df = DataFrame(np.random.randn(10, 4))
|
|
pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
|
|
level = ['three', 'two', 'one', 'zero']
|
|
result = concat(pieces, axis=1, keys=['one', 'two', 'three'],
|
|
levels=[level],
|
|
names=['group_key'])
|
|
|
|
tm.assert_index_equal(result.columns.levels[0],
|
|
Index(level, name='group_key'))
|
|
assert result.columns.names[0] == 'group_key'
|
|
|
|
def test_concat_dataframe_keys_bug(self, sort):
|
|
t1 = DataFrame({
|
|
'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'],
|
|
name='id'))})
|
|
t2 = DataFrame({
|
|
'value': Series([7, 8], index=Index(['a', 'b'], name='id'))})
|
|
|
|
# it works
|
|
result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort)
|
|
assert list(result.columns) == [('t1', 'value'), ('t2', 'value')]
|
|
|
|
def test_concat_series_partial_columns_names(self):
|
|
# GH10698
|
|
foo = Series([1, 2], name='foo')
|
|
bar = Series([1, 2])
|
|
baz = Series([4, 5])
|
|
|
|
result = concat([foo, bar, baz], axis=1)
|
|
expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [
|
|
4, 5]}, columns=['foo', 0, 1])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat([foo, bar, baz], axis=1, keys=[
|
|
'red', 'blue', 'yellow'])
|
|
expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [
|
|
4, 5]}, columns=['red', 'blue', 'yellow'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat([foo, bar, baz], axis=1, ignore_index=True)
|
|
expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_dict(self):
|
|
frames = {'foo': DataFrame(np.random.randn(4, 3)),
|
|
'bar': DataFrame(np.random.randn(4, 3)),
|
|
'baz': DataFrame(np.random.randn(4, 3)),
|
|
'qux': DataFrame(np.random.randn(4, 3))}
|
|
|
|
sorted_keys = sorted(frames)
|
|
|
|
result = concat(frames)
|
|
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat(frames, axis=1)
|
|
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys,
|
|
axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
keys = ['baz', 'foo', 'bar']
|
|
result = concat(frames, keys=keys)
|
|
expected = concat([frames[k] for k in keys], keys=keys)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_ignore_index(self, sort):
|
|
frame1 = DataFrame({"test1": ["a", "b", "c"],
|
|
"test2": [1, 2, 3],
|
|
"test3": [4.5, 3.2, 1.2]})
|
|
frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
|
|
frame1.index = Index(["x", "y", "z"])
|
|
frame2.index = Index(["x", "y", "q"])
|
|
|
|
v1 = concat([frame1, frame2], axis=1,
|
|
ignore_index=True, sort=sort)
|
|
|
|
nan = np.nan
|
|
expected = DataFrame([[nan, nan, nan, 4.3],
|
|
['a', 1, 4.5, 5.2],
|
|
['b', 2, 3.2, 2.2],
|
|
['c', 3, 1.2, nan]],
|
|
index=Index(["q", "x", "y", "z"]))
|
|
if not sort:
|
|
expected = expected.loc[['x', 'y', 'z', 'q']]
|
|
|
|
tm.assert_frame_equal(v1, expected)
|
|
|
|
def test_concat_multiindex_with_keys(self):
|
|
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
|
|
['one', 'two', 'three']],
|
|
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
|
|
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
|
|
names=['first', 'second'])
|
|
frame = DataFrame(np.random.randn(10, 3), index=index,
|
|
columns=Index(['A', 'B', 'C'], name='exp'))
|
|
result = concat([frame, frame], keys=[0, 1], names=['iteration'])
|
|
|
|
assert result.index.names == ('iteration',) + index.names
|
|
tm.assert_frame_equal(result.loc[0], frame)
|
|
tm.assert_frame_equal(result.loc[1], frame)
|
|
assert result.index.nlevels == 3
|
|
|
|
def test_concat_multiindex_with_tz(self):
|
|
# GH 6606
|
|
df = DataFrame({'dt': [datetime(2014, 1, 1),
|
|
datetime(2014, 1, 2),
|
|
datetime(2014, 1, 3)],
|
|
'b': ['A', 'B', 'C'],
|
|
'c': [1, 2, 3], 'd': [4, 5, 6]})
|
|
df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
|
|
df = df.set_index(['dt', 'b'])
|
|
|
|
exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02',
|
|
'2014-01-03'] * 2,
|
|
tz='US/Pacific', name='dt')
|
|
exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
|
|
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
|
|
expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
|
|
index=exp_idx, columns=['c', 'd'])
|
|
|
|
result = concat([df, df])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_multiindex_with_none_in_index_names(self):
|
|
# GH 15787
|
|
index = pd.MultiIndex.from_product([[1], range(5)],
|
|
names=['level1', None])
|
|
df = pd.DataFrame({'col': range(5)}, index=index, dtype=np.int32)
|
|
|
|
result = concat([df, df], keys=[1, 2], names=['level2'])
|
|
index = pd.MultiIndex.from_product([[1, 2], [1], range(5)],
|
|
names=['level2', 'level1', None])
|
|
expected = pd.DataFrame({'col': list(range(5)) * 2},
|
|
index=index, dtype=np.int32)
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = concat([df, df[:2]], keys=[1, 2], names=['level2'])
|
|
level2 = [1] * 5 + [2] * 2
|
|
level1 = [1] * 7
|
|
no_name = list(range(5)) + list(range(2))
|
|
tuples = list(zip(level2, level1, no_name))
|
|
index = pd.MultiIndex.from_tuples(tuples,
|
|
names=['level2', 'level1', None])
|
|
expected = pd.DataFrame({'col': no_name}, index=index,
|
|
dtype=np.int32)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_keys_and_levels(self):
|
|
df = DataFrame(np.random.randn(1, 3))
|
|
df2 = DataFrame(np.random.randn(1, 4))
|
|
|
|
levels = [['foo', 'baz'], ['one', 'two']]
|
|
names = ['first', 'second']
|
|
result = concat([df, df2, df, df2],
|
|
keys=[('foo', 'one'), ('foo', 'two'),
|
|
('baz', 'one'), ('baz', 'two')],
|
|
levels=levels,
|
|
names=names)
|
|
expected = concat([df, df2, df, df2])
|
|
exp_index = MultiIndex(levels=levels + [[0]],
|
|
labels=[[0, 0, 1, 1], [0, 1, 0, 1],
|
|
[0, 0, 0, 0]],
|
|
names=names + [None])
|
|
expected.index = exp_index
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# no names
|
|
result = concat([df, df2, df, df2],
|
|
keys=[('foo', 'one'), ('foo', 'two'),
|
|
('baz', 'one'), ('baz', 'two')],
|
|
levels=levels)
|
|
assert result.index.names == (None,) * 3
|
|
|
|
# no levels
|
|
result = concat([df, df2, df, df2],
|
|
keys=[('foo', 'one'), ('foo', 'two'),
|
|
('baz', 'one'), ('baz', 'two')],
|
|
names=['first', 'second'])
|
|
assert result.index.names == ('first', 'second') + (None,)
|
|
tm.assert_index_equal(result.index.levels[0],
|
|
Index(['baz', 'foo'], name='first'))
|
|
|
|
def test_concat_keys_levels_no_overlap(self):
|
|
# GH #1406
|
|
df = DataFrame(np.random.randn(1, 3), index=['a'])
|
|
df2 = DataFrame(np.random.randn(1, 4), index=['b'])
|
|
|
|
pytest.raises(ValueError, concat, [df, df],
|
|
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
|
|
|
|
pytest.raises(ValueError, concat, [df, df2],
|
|
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
|
|
|
|
def test_concat_rename_index(self):
|
|
a = DataFrame(np.random.rand(3, 3),
|
|
columns=list('ABC'),
|
|
index=Index(list('abc'), name='index_a'))
|
|
b = DataFrame(np.random.rand(3, 3),
|
|
columns=list('ABC'),
|
|
index=Index(list('abc'), name='index_b'))
|
|
|
|
result = concat([a, b], keys=['key0', 'key1'],
|
|
names=['lvl0', 'lvl1'])
|
|
|
|
exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0'])
|
|
names = list(exp.index.names)
|
|
names[1] = 'lvl1'
|
|
exp.index.set_names(names, inplace=True)
|
|
|
|
tm.assert_frame_equal(result, exp)
|
|
assert result.index.names == exp.index.names
|
|
|
|
def test_crossed_dtypes_weird_corner(self):
|
|
columns = ['A', 'B', 'C', 'D']
|
|
df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'),
|
|
'B': np.array([1, 2, 3, 4], dtype='i8'),
|
|
'C': np.array([1, 2, 3, 4], dtype='f8'),
|
|
'D': np.array([1, 2, 3, 4], dtype='i8')},
|
|
columns=columns)
|
|
|
|
df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'),
|
|
'B': np.array([1, 2, 3, 4], dtype='f8'),
|
|
'C': np.array([1, 2, 3, 4], dtype='i8'),
|
|
'D': np.array([1, 2, 3, 4], dtype='f8')},
|
|
columns=columns)
|
|
|
|
appended = df1.append(df2, ignore_index=True)
|
|
expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0),
|
|
columns=columns)
|
|
tm.assert_frame_equal(appended, expected)
|
|
|
|
df = DataFrame(np.random.randn(1, 3), index=['a'])
|
|
df2 = DataFrame(np.random.randn(1, 4), index=['b'])
|
|
result = concat(
|
|
[df, df2], keys=['one', 'two'], names=['first', 'second'])
|
|
assert result.index.names == ('first', 'second')
|
|
|
|
def test_dups_index(self):
|
|
# GH 4771
|
|
|
|
# single dtypes
|
|
df = DataFrame(np.random.randint(0, 10, size=40).reshape(
|
|
10, 4), columns=['A', 'A', 'C', 'C'])
|
|
|
|
result = concat([df, df], axis=1)
|
|
assert_frame_equal(result.iloc[:, :4], df)
|
|
assert_frame_equal(result.iloc[:, 4:], df)
|
|
|
|
result = concat([df, df], axis=0)
|
|
assert_frame_equal(result.iloc[:10], df)
|
|
assert_frame_equal(result.iloc[10:], df)
|
|
|
|
# multi dtypes
|
|
df = concat([DataFrame(np.random.randn(10, 4),
|
|
columns=['A', 'A', 'B', 'B']),
|
|
DataFrame(np.random.randint(0, 10, size=20)
|
|
.reshape(10, 2),
|
|
columns=['A', 'C'])],
|
|
axis=1)
|
|
|
|
result = concat([df, df], axis=1)
|
|
assert_frame_equal(result.iloc[:, :6], df)
|
|
assert_frame_equal(result.iloc[:, 6:], df)
|
|
|
|
result = concat([df, df], axis=0)
|
|
assert_frame_equal(result.iloc[:10], df)
|
|
assert_frame_equal(result.iloc[10:], df)
|
|
|
|
# append
|
|
result = df.iloc[0:8, :].append(df.iloc[8:])
|
|
assert_frame_equal(result, df)
|
|
|
|
result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10])
|
|
assert_frame_equal(result, df)
|
|
|
|
expected = concat([df, df], axis=0)
|
|
result = df.append(df)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_with_mixed_tuples(self, sort):
|
|
# 10697
|
|
# columns have mixed tuples, so handle properly
|
|
df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2))
|
|
df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2))
|
|
|
|
# it works
|
|
concat([df1, df2], sort=sort)
|
|
|
|
def test_handle_empty_objects(self, sort):
|
|
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
|
|
|
|
baz = df[:5].copy()
|
|
baz['foo'] = 'bar'
|
|
empty = df[5:5]
|
|
|
|
frames = [baz, empty, empty, df[5:]]
|
|
concatted = concat(frames, axis=0, sort=sort)
|
|
|
|
expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo'])
|
|
expected['foo'] = expected['foo'].astype('O')
|
|
expected.loc[0:4, 'foo'] = 'bar'
|
|
|
|
tm.assert_frame_equal(concatted, expected)
|
|
|
|
# empty as first element with time series
|
|
# GH3259
|
|
df = DataFrame(dict(A=range(10000)), index=date_range(
|
|
'20130101', periods=10000, freq='s'))
|
|
empty = DataFrame()
|
|
result = concat([df, empty], axis=1)
|
|
assert_frame_equal(result, df)
|
|
result = concat([empty, df], axis=1)
|
|
assert_frame_equal(result, df)
|
|
|
|
result = concat([df, empty])
|
|
assert_frame_equal(result, df)
|
|
result = concat([empty, df])
|
|
assert_frame_equal(result, df)
|
|
|
|
def test_concat_mixed_objs(self):
|
|
|
|
# concat mixed series/frames
|
|
# G2385
|
|
|
|
# axis 1
|
|
index = date_range('01-Jan-2013', periods=10, freq='H')
|
|
arr = np.arange(10, dtype='int64')
|
|
s1 = Series(arr, index=index)
|
|
s2 = Series(arr, index=index)
|
|
df = DataFrame(arr.reshape(-1, 1), index=index)
|
|
|
|
expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
|
|
index=index, columns=[0, 0])
|
|
result = concat([df, df], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
|
|
index=index, columns=[0, 1])
|
|
result = concat([s1, s2], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
|
|
index=index, columns=[0, 1, 2])
|
|
result = concat([s1, s2, s1], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5),
|
|
index=index, columns=[0, 0, 1, 2, 3])
|
|
result = concat([s1, df, s2, s2, s1], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# with names
|
|
s1.name = 'foo'
|
|
expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
|
|
index=index, columns=['foo', 0, 0])
|
|
result = concat([s1, df, s2], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
s2.name = 'bar'
|
|
expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
|
|
index=index, columns=['foo', 0, 'bar'])
|
|
result = concat([s1, df, s2], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# ignore index
|
|
expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
|
|
index=index, columns=[0, 1, 2])
|
|
result = concat([s1, df, s2], axis=1, ignore_index=True)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# axis 0
|
|
expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
|
|
index=index.tolist() * 3, columns=[0])
|
|
result = concat([s1, df, s2])
|
|
assert_frame_equal(result, expected)
|
|
|
|
expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
|
|
result = concat([s1, df, s2], ignore_index=True)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# invalid concatente of mixed dims
|
|
with catch_warnings(record=True):
|
|
panel = tm.makePanel()
|
|
pytest.raises(ValueError, lambda: concat([panel, s1], axis=1))
|
|
|
|
def test_empty_dtype_coerce(self):
|
|
|
|
# xref to #12411
|
|
# xref to #12045
|
|
# xref to #11594
|
|
# see below
|
|
|
|
# 10571
|
|
df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b'])
|
|
df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b'])
|
|
result = concat([df1, df2])
|
|
expected = df1.dtypes
|
|
tm.assert_series_equal(result.dtypes, expected)
|
|
|
|
def test_dtype_coerceion(self):
|
|
|
|
# 12411
|
|
df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
|
|
pd.NaT]})
|
|
|
|
result = concat([df.iloc[[0]], df.iloc[[1]]])
|
|
tm.assert_series_equal(result.dtypes, df.dtypes)
|
|
|
|
# 12045
|
|
import datetime
|
|
df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
|
|
datetime.datetime(1012, 1, 2)]})
|
|
result = concat([df.iloc[[0]], df.iloc[[1]]])
|
|
tm.assert_series_equal(result.dtypes, df.dtypes)
|
|
|
|
# 11594
|
|
df = DataFrame({'text': ['some words'] + [None] * 9})
|
|
result = concat([df.iloc[[0]], df.iloc[[1]]])
|
|
tm.assert_series_equal(result.dtypes, df.dtypes)
|
|
|
|
def test_panel_concat_other_axes(self):
|
|
with catch_warnings(record=True):
|
|
panel = tm.makePanel()
|
|
|
|
p1 = panel.iloc[:, :5, :]
|
|
p2 = panel.iloc[:, 5:, :]
|
|
|
|
result = concat([p1, p2], axis=1)
|
|
tm.assert_panel_equal(result, panel)
|
|
|
|
p1 = panel.iloc[:, :, :2]
|
|
p2 = panel.iloc[:, :, 2:]
|
|
|
|
result = concat([p1, p2], axis=2)
|
|
tm.assert_panel_equal(result, panel)
|
|
|
|
# if things are a bit misbehaved
|
|
p1 = panel.iloc[:2, :, :2]
|
|
p2 = panel.iloc[:, :, 2:]
|
|
p1['ItemC'] = 'baz'
|
|
|
|
result = concat([p1, p2], axis=2)
|
|
|
|
expected = panel.copy()
|
|
expected['ItemC'] = expected['ItemC'].astype('O')
|
|
expected.loc['ItemC', :, :2] = 'baz'
|
|
tm.assert_panel_equal(result, expected)
|
|
|
|
def test_panel_concat_buglet(self, sort):
|
|
with catch_warnings(record=True):
|
|
# #2257
|
|
def make_panel():
|
|
index = 5
|
|
cols = 3
|
|
|
|
def df():
|
|
return DataFrame(np.random.randn(index, cols),
|
|
index=["I%s" % i for i in range(index)],
|
|
columns=["C%s" % i for i in range(cols)])
|
|
return Panel(dict(("Item%s" % x, df())
|
|
for x in ['A', 'B', 'C']))
|
|
|
|
panel1 = make_panel()
|
|
panel2 = make_panel()
|
|
|
|
panel2 = panel2.rename_axis(dict((x, "%s_1" % x)
|
|
for x in panel2.major_axis),
|
|
axis=1)
|
|
|
|
panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1)
|
|
panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2)
|
|
|
|
# it works!
|
|
concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort)
|
|
|
|
def test_concat_series(self):
|
|
|
|
ts = tm.makeTimeSeries()
|
|
ts.name = 'foo'
|
|
|
|
pieces = [ts[:5], ts[5:15], ts[15:]]
|
|
|
|
result = concat(pieces)
|
|
tm.assert_series_equal(result, ts)
|
|
assert result.name == ts.name
|
|
|
|
result = concat(pieces, keys=[0, 1, 2])
|
|
expected = ts.copy()
|
|
|
|
ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]'))
|
|
|
|
exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
|
|
np.arange(len(ts))]
|
|
exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
|
|
labels=exp_labels)
|
|
expected.index = exp_index
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_concat_series_axis1(self, sort=sort):
|
|
ts = tm.makeTimeSeries()
|
|
|
|
pieces = [ts[:-2], ts[2:], ts[2:-2]]
|
|
|
|
result = concat(pieces, axis=1)
|
|
expected = DataFrame(pieces).T
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = concat(pieces, keys=['A', 'B', 'C'], axis=1)
|
|
expected = DataFrame(pieces, index=['A', 'B', 'C']).T
|
|
assert_frame_equal(result, expected)
|
|
|
|
# preserve series names, #2489
|
|
s = Series(randn(5), name='A')
|
|
s2 = Series(randn(5), name='B')
|
|
|
|
result = concat([s, s2], axis=1)
|
|
expected = DataFrame({'A': s, 'B': s2})
|
|
assert_frame_equal(result, expected)
|
|
|
|
s2.name = None
|
|
result = concat([s, s2], axis=1)
|
|
tm.assert_index_equal(result.columns,
|
|
Index(['A', 0], dtype='object'))
|
|
|
|
# must reindex, #2603
|
|
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
|
|
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
|
|
result = concat([s, s2], axis=1, sort=sort)
|
|
expected = DataFrame({'A': s, 'B': s2})
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_single_with_key(self):
|
|
df = DataFrame(np.random.randn(10, 4))
|
|
|
|
result = concat([df], keys=['foo'])
|
|
expected = concat([df, df], keys=['foo', 'bar'])
|
|
tm.assert_frame_equal(result, expected[:10])
|
|
|
|
def test_concat_exclude_none(self):
|
|
df = DataFrame(np.random.randn(10, 4))
|
|
|
|
pieces = [df[:5], None, None, df[5:]]
|
|
result = concat(pieces)
|
|
tm.assert_frame_equal(result, df)
|
|
pytest.raises(ValueError, concat, [None, None])
|
|
|
|
def test_concat_datetime64_block(self):
|
|
from pandas.core.indexes.datetimes import date_range
|
|
|
|
rng = date_range('1/1/2000', periods=10)
|
|
|
|
df = DataFrame({'time': rng})
|
|
|
|
result = concat([df, df])
|
|
assert (result.iloc[:10]['time'] == rng).all()
|
|
assert (result.iloc[10:]['time'] == rng).all()
|
|
|
|
def test_concat_timedelta64_block(self):
|
|
from pandas import to_timedelta
|
|
|
|
rng = to_timedelta(np.arange(10), unit='s')
|
|
|
|
df = DataFrame({'time': rng})
|
|
|
|
result = concat([df, df])
|
|
assert (result.iloc[:10]['time'] == rng).all()
|
|
assert (result.iloc[10:]['time'] == rng).all()
|
|
|
|
def test_concat_keys_with_none(self):
|
|
# #1649
|
|
df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
|
|
|
|
result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0))
|
|
expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = concat([None, df0, df0[:2], df0[:1], df0],
|
|
keys=['a', 'b', 'c', 'd', 'e'])
|
|
expected = concat([df0, df0[:2], df0[:1], df0],
|
|
keys=['b', 'c', 'd', 'e'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_concat_bug_1719(self):
|
|
ts1 = tm.makeTimeSeries()
|
|
ts2 = tm.makeTimeSeries()[::2]
|
|
|
|
# to join with union
|
|
# these two are of different length!
|
|
left = concat([ts1, ts2], join='outer', axis=1)
|
|
right = concat([ts2, ts1], join='outer', axis=1)
|
|
|
|
assert len(left) == len(right)
|
|
|
|
def test_concat_bug_2972(self):
|
|
ts0 = Series(np.zeros(5))
|
|
ts1 = Series(np.ones(5))
|
|
ts0.name = ts1.name = 'same name'
|
|
result = concat([ts0, ts1], axis=1)
|
|
|
|
expected = DataFrame({0: ts0, 1: ts1})
|
|
expected.columns = ['same name', 'same name']
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_bug_3602(self):
|
|
|
|
# GH 3602, duplicate columns
|
|
df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6],
|
|
'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']})
|
|
df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4],
|
|
'prc': [6, 6, 6, 6]})
|
|
expected = DataFrame([[0, 6, 'rrr', 9, 1, 6],
|
|
[0, 6, 'rrr', 10, 2, 6],
|
|
[0, 6, 'rrr', 11, 3, 6],
|
|
[0, 6, 'rrr', 12, 4, 6]])
|
|
expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc']
|
|
|
|
result = concat([df1, df2], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_inner_join_empty(self):
|
|
# GH 15328
|
|
df_empty = pd.DataFrame()
|
|
df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
|
|
df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
|
|
|
|
for how, expected in [('inner', df_expected), ('outer', df_a)]:
|
|
result = pd.concat([df_a, df_empty], axis=1, join=how)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_series_axis1_same_names_ignore_index(self):
|
|
dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
|
|
s1 = Series(randn(len(dates)), index=dates, name='value')
|
|
s2 = Series(randn(len(dates)), index=dates, name='value')
|
|
|
|
result = concat([s1, s2], axis=1, ignore_index=True)
|
|
expected = Index([0, 1])
|
|
|
|
tm.assert_index_equal(result.columns, expected)
|
|
|
|
def test_concat_iterables(self):
|
|
from collections import deque, Iterable
|
|
|
|
# GH8645 check concat works with tuples, list, generators, and weird
|
|
# stuff like deque and custom iterables
|
|
df1 = DataFrame([1, 2, 3])
|
|
df2 = DataFrame([4, 5, 6])
|
|
expected = DataFrame([1, 2, 3, 4, 5, 6])
|
|
assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
|
|
assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
|
|
assert_frame_equal(concat((df for df in (df1, df2)),
|
|
ignore_index=True), expected)
|
|
assert_frame_equal(
|
|
concat(deque((df1, df2)), ignore_index=True), expected)
|
|
|
|
class CustomIterator1(object):
|
|
|
|
def __len__(self):
|
|
return 2
|
|
|
|
def __getitem__(self, index):
|
|
try:
|
|
return {0: df1, 1: df2}[index]
|
|
except KeyError:
|
|
raise IndexError
|
|
assert_frame_equal(pd.concat(CustomIterator1(),
|
|
ignore_index=True), expected)
|
|
|
|
class CustomIterator2(Iterable):
|
|
|
|
def __iter__(self):
|
|
yield df1
|
|
yield df2
|
|
assert_frame_equal(pd.concat(CustomIterator2(),
|
|
ignore_index=True), expected)
|
|
|
|
def test_concat_invalid(self):
|
|
|
|
# trying to concat a ndframe with a non-ndframe
|
|
df1 = mkdf(10, 2)
|
|
for obj in [1, dict(), [1, 2], (1, 2)]:
|
|
pytest.raises(TypeError, lambda x: concat([df1, obj]))
|
|
|
|
def test_concat_invalid_first_argument(self):
|
|
df1 = mkdf(10, 2)
|
|
df2 = mkdf(10, 2)
|
|
pytest.raises(TypeError, concat, df1, df2)
|
|
|
|
# generator ok though
|
|
concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))
|
|
|
|
# text reader ok
|
|
# GH6583
|
|
data = """index,A,B,C,D
|
|
foo,2,3,4,5
|
|
bar,7,8,9,10
|
|
baz,12,13,14,15
|
|
qux,12,13,14,15
|
|
foo2,12,13,14,15
|
|
bar2,12,13,14,15
|
|
"""
|
|
|
|
reader = read_csv(StringIO(data), chunksize=1)
|
|
result = concat(reader, ignore_index=True)
|
|
expected = read_csv(StringIO(data))
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_NaT_series(self):
|
|
# GH 11693
|
|
# test for merging NaT series with datetime series.
|
|
x = Series(date_range('20151124 08:00', '20151124 09:00',
|
|
freq='1h', tz='US/Eastern'))
|
|
y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
|
|
expected = Series([x[0], x[1], pd.NaT, pd.NaT])
|
|
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# all NaT with tz
|
|
expected = Series(pd.NaT, index=range(4),
|
|
dtype='datetime64[ns, US/Eastern]')
|
|
result = pd.concat([y, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# without tz
|
|
x = pd.Series(pd.date_range('20151124 08:00',
|
|
'20151124 09:00', freq='1h'))
|
|
y = pd.Series(pd.date_range('20151124 10:00',
|
|
'20151124 11:00', freq='1h'))
|
|
y[:] = pd.NaT
|
|
expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT])
|
|
result = pd.concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# all NaT without tz
|
|
x[:] = pd.NaT
|
|
expected = pd.Series(pd.NaT, index=range(4),
|
|
dtype='datetime64[ns]')
|
|
result = pd.concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_concat_tz_frame(self):
|
|
df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'),
|
|
B=pd.Timestamp('20130603', tz='CET')),
|
|
index=range(5))
|
|
|
|
# concat
|
|
df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
|
|
assert_frame_equal(df2, df3)
|
|
|
|
def test_concat_tz_series(self):
|
|
# gh-11755: tz and no tz
|
|
x = Series(date_range('20151124 08:00',
|
|
'20151124 09:00',
|
|
freq='1h', tz='UTC'))
|
|
y = Series(date_range('2012-01-01', '2012-01-02'))
|
|
expected = Series([x[0], x[1], y[0], y[1]],
|
|
dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# gh-11887: concat tz and object
|
|
x = Series(date_range('20151124 08:00',
|
|
'20151124 09:00',
|
|
freq='1h', tz='UTC'))
|
|
y = Series(['a', 'b'])
|
|
expected = Series([x[0], x[1], y[0], y[1]],
|
|
dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# see gh-12217 and gh-12306
|
|
# Concatenating two UTC times
|
|
first = pd.DataFrame([[datetime(2016, 1, 1)]])
|
|
first[0] = first[0].dt.tz_localize('UTC')
|
|
|
|
second = pd.DataFrame([[datetime(2016, 1, 2)]])
|
|
second[0] = second[0].dt.tz_localize('UTC')
|
|
|
|
result = pd.concat([first, second])
|
|
assert result[0].dtype == 'datetime64[ns, UTC]'
|
|
|
|
# Concatenating two London times
|
|
first = pd.DataFrame([[datetime(2016, 1, 1)]])
|
|
first[0] = first[0].dt.tz_localize('Europe/London')
|
|
|
|
second = pd.DataFrame([[datetime(2016, 1, 2)]])
|
|
second[0] = second[0].dt.tz_localize('Europe/London')
|
|
|
|
result = pd.concat([first, second])
|
|
assert result[0].dtype == 'datetime64[ns, Europe/London]'
|
|
|
|
# Concatenating 2+1 London times
|
|
first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]])
|
|
first[0] = first[0].dt.tz_localize('Europe/London')
|
|
|
|
second = pd.DataFrame([[datetime(2016, 1, 3)]])
|
|
second[0] = second[0].dt.tz_localize('Europe/London')
|
|
|
|
result = pd.concat([first, second])
|
|
assert result[0].dtype == 'datetime64[ns, Europe/London]'
|
|
|
|
# Concat'ing 1+2 London times
|
|
first = pd.DataFrame([[datetime(2016, 1, 1)]])
|
|
first[0] = first[0].dt.tz_localize('Europe/London')
|
|
|
|
second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]])
|
|
second[0] = second[0].dt.tz_localize('Europe/London')
|
|
|
|
result = pd.concat([first, second])
|
|
assert result[0].dtype == 'datetime64[ns, Europe/London]'
|
|
|
|
def test_concat_tz_series_with_datetimelike(self):
|
|
# see gh-12620: tz and timedelta
|
|
x = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
|
pd.Timestamp('2011-02-01', tz='US/Eastern')]
|
|
y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')]
|
|
result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
|
|
tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
|
|
|
|
# tz and period
|
|
y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')]
|
|
result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
|
|
tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
|
|
|
|
def test_concat_tz_series_tzlocal(self):
|
|
# see gh-13583
|
|
x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()),
|
|
pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())]
|
|
y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()),
|
|
pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())]
|
|
|
|
result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
|
|
tm.assert_series_equal(result, pd.Series(x + y))
|
|
assert result.dtype == 'datetime64[ns, tzlocal()]'
|
|
|
|
@pytest.mark.parametrize('tz1', [None, 'UTC'])
|
|
@pytest.mark.parametrize('tz2', [None, 'UTC'])
|
|
@pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
|
|
def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
|
|
# GH 12396
|
|
|
|
# tz-naive
|
|
first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
|
|
lambda x: x.dt.tz_localize(tz1))
|
|
second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
|
|
|
|
result = pd.concat([first, second], axis=0)
|
|
expected = pd.DataFrame(pd.Series(
|
|
[pd.NaT, pd.NaT, s], index=[0, 1, 0]))
|
|
expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
|
|
if tz1 != tz2:
|
|
expected = expected.astype(object)
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('tz1', [None, 'UTC'])
|
|
@pytest.mark.parametrize('tz2', [None, 'UTC'])
|
|
def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
|
|
# GH 12396
|
|
|
|
first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
|
|
second = pd.DataFrame(pd.Series(
|
|
[pd.NaT]).dt.tz_localize(tz2), columns=[1])
|
|
expected = pd.DataFrame(
|
|
{0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
|
|
1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)}
|
|
)
|
|
result = pd.concat([first, second], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('tz1', [None, 'UTC'])
|
|
@pytest.mark.parametrize('tz2', [None, 'UTC'])
|
|
def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
|
|
# GH 12396
|
|
|
|
# tz-naive
|
|
first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
|
|
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)],
|
|
[pd.Timestamp('2016/01/01', tz=tz2)]],
|
|
index=[2, 3])
|
|
|
|
expected = pd.DataFrame([pd.NaT, pd.NaT,
|
|
pd.Timestamp('2015/01/01', tz=tz2),
|
|
pd.Timestamp('2016/01/01', tz=tz2)])
|
|
if tz1 != tz2:
|
|
expected = expected.astype(object)
|
|
|
|
result = pd.concat([first, second])
|
|
assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('tz', [None, 'UTC'])
|
|
def test_concat_NaT_dataframes(self, tz):
|
|
# GH 12396
|
|
|
|
first = pd.DataFrame([[pd.NaT], [pd.NaT]])
|
|
first = first.apply(lambda x: x.dt.tz_localize(tz))
|
|
second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)],
|
|
[pd.Timestamp('2016/01/01', tz=tz)]],
|
|
index=[2, 3])
|
|
expected = pd.DataFrame([pd.NaT, pd.NaT,
|
|
pd.Timestamp('2015/01/01', tz=tz),
|
|
pd.Timestamp('2016/01/01', tz=tz)])
|
|
|
|
result = pd.concat([first, second], axis=0)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_concat_period_series(self):
|
|
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
|
|
y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
|
|
expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
assert result.dtype == 'object'
|
|
|
|
# different freq
|
|
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
|
|
y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M'))
|
|
expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
assert result.dtype == 'object'
|
|
|
|
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
|
|
y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M'))
|
|
expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
assert result.dtype == 'object'
|
|
|
|
# non-period
|
|
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
|
|
y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01']))
|
|
expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
assert result.dtype == 'object'
|
|
|
|
x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
|
|
y = Series(['A', 'B'])
|
|
expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
|
|
result = concat([x, y], ignore_index=True)
|
|
tm.assert_series_equal(result, expected)
|
|
assert result.dtype == 'object'
|
|
|
|
def test_concat_empty_series(self):
|
|
# GH 11082
|
|
s1 = pd.Series([1, 2, 3], name='x')
|
|
s2 = pd.Series(name='y')
|
|
res = pd.concat([s1, s2], axis=1)
|
|
exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]})
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
s1 = pd.Series([1, 2, 3], name='x')
|
|
s2 = pd.Series(name='y')
|
|
res = pd.concat([s1, s2], axis=0)
|
|
# name will be reset
|
|
exp = pd.Series([1, 2, 3])
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
# empty Series with no name
|
|
s1 = pd.Series([1, 2, 3], name='x')
|
|
s2 = pd.Series(name=None)
|
|
res = pd.concat([s1, s2], axis=1)
|
|
exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
|
|
columns=['x', 0])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
@pytest.mark.parametrize('tz', [None, 'UTC'])
|
|
@pytest.mark.parametrize('values', [[], [1, 2, 3]])
|
|
def test_concat_empty_series_timelike(self, tz, values):
|
|
# GH 18447
|
|
|
|
first = Series([], dtype='M8[ns]').dt.tz_localize(tz)
|
|
second = Series(values)
|
|
expected = DataFrame(
|
|
{0: pd.Series([pd.NaT] * len(values),
|
|
dtype='M8[ns]'
|
|
).dt.tz_localize(tz),
|
|
1: values})
|
|
result = concat([first, second], axis=1)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_default_index(self):
|
|
# is_series and ignore_index
|
|
s1 = pd.Series([1, 2, 3], name='x')
|
|
s2 = pd.Series([4, 5, 6], name='y')
|
|
res = pd.concat([s1, s2], axis=1, ignore_index=True)
|
|
assert isinstance(res.columns, pd.RangeIndex)
|
|
exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
|
|
# use check_index_type=True to check the result have
|
|
# RangeIndex (default index)
|
|
tm.assert_frame_equal(res, exp, check_index_type=True,
|
|
check_column_type=True)
|
|
|
|
# is_series and all inputs have no names
|
|
s1 = pd.Series([1, 2, 3])
|
|
s2 = pd.Series([4, 5, 6])
|
|
res = pd.concat([s1, s2], axis=1, ignore_index=False)
|
|
assert isinstance(res.columns, pd.RangeIndex)
|
|
exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
|
|
exp.columns = pd.RangeIndex(2)
|
|
tm.assert_frame_equal(res, exp, check_index_type=True,
|
|
check_column_type=True)
|
|
|
|
# is_dataframe and ignore_index
|
|
df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]})
|
|
df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]})
|
|
|
|
res = pd.concat([df1, df2], axis=0, ignore_index=True)
|
|
exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]],
|
|
columns=['A', 'B'])
|
|
tm.assert_frame_equal(res, exp, check_index_type=True,
|
|
check_column_type=True)
|
|
|
|
res = pd.concat([df1, df2], axis=1, ignore_index=True)
|
|
exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
|
|
tm.assert_frame_equal(res, exp, check_index_type=True,
|
|
check_column_type=True)
|
|
|
|
def test_concat_multiindex_rangeindex(self):
|
|
# GH13542
|
|
# when multi-index levels are RangeIndex objects
|
|
# there is a bug in concat with objects of len 1
|
|
|
|
df = DataFrame(np.random.randn(9, 2))
|
|
df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
|
|
labels=[np.repeat(np.arange(3), 3),
|
|
np.tile(np.arange(3), 3)])
|
|
|
|
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
|
|
exp = df.iloc[[2, 3, 4, 5], :]
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_concat_multiindex_dfs_with_deepcopy(self):
|
|
# GH 9967
|
|
from copy import deepcopy
|
|
example_multiindex1 = pd.MultiIndex.from_product([['a'], ['b']])
|
|
example_dataframe1 = pd.DataFrame([0], index=example_multiindex1)
|
|
|
|
example_multiindex2 = pd.MultiIndex.from_product([['a'], ['c']])
|
|
example_dataframe2 = pd.DataFrame([1], index=example_multiindex2)
|
|
|
|
example_dict = {'s1': example_dataframe1, 's2': example_dataframe2}
|
|
expected_index = pd.MultiIndex(levels=[['s1', 's2'],
|
|
['a'],
|
|
['b', 'c']],
|
|
labels=[[0, 1], [0, 0], [0, 1]],
|
|
names=['testname', None, None])
|
|
expected = pd.DataFrame([[0], [1]], index=expected_index)
|
|
result_copy = pd.concat(deepcopy(example_dict), names=['testname'])
|
|
tm.assert_frame_equal(result_copy, expected)
|
|
result_no_copy = pd.concat(example_dict, names=['testname'])
|
|
tm.assert_frame_equal(result_no_copy, expected)
|
|
|
|
def test_categorical_concat_append(self):
|
|
cat = Categorical(["a", "b"], categories=["a", "b"])
|
|
vals = [1, 2]
|
|
df = DataFrame({"cats": cat, "vals": vals})
|
|
cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
|
|
vals2 = [1, 2, 1, 2]
|
|
exp = DataFrame({"cats": cat2, "vals": vals2},
|
|
index=Index([0, 1, 0, 1]))
|
|
|
|
tm.assert_frame_equal(pd.concat([df, df]), exp)
|
|
tm.assert_frame_equal(df.append(df), exp)
|
|
|
|
# GH 13524 can concat different categories
|
|
cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
|
|
vals3 = [1, 2]
|
|
df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
|
|
|
|
res = pd.concat([df, df_different_categories], ignore_index=True)
|
|
exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]})
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
res = df.append(df_different_categories, ignore_index=True)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_categorical_concat_dtypes(self):
|
|
|
|
# GH8143
|
|
index = ['cat', 'obj', 'num']
|
|
cat = Categorical(['a', 'b', 'c'])
|
|
obj = Series(['a', 'b', 'c'])
|
|
num = Series([1, 2, 3])
|
|
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
|
|
|
|
result = df.dtypes == 'object'
|
|
expected = Series([False, True, False], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.dtypes == 'int64'
|
|
expected = Series([False, False, True], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.dtypes == 'category'
|
|
expected = Series([True, False, False], index=index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_categorical_concat(self, sort):
|
|
# See GH 10177
|
|
df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
|
|
columns=["a", "b", "c"])
|
|
|
|
df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
|
|
columns=["a", "c"])
|
|
|
|
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
|
|
df2['h'] = Series(Categorical(cat_values))
|
|
|
|
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
|
|
exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
|
|
'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan,
|
|
np.nan, np.nan, np.nan, np.nan],
|
|
'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
|
|
'h': [None] * 6 + cat_values})
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_categorical_concat_gh7864(self):
|
|
# GH 7864
|
|
# make sure ordering is preserverd
|
|
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')})
|
|
df["grade"] = Categorical(df["raw_grade"])
|
|
df['grade'].cat.set_categories(['e', 'a', 'b'])
|
|
|
|
df1 = df[0:3]
|
|
df2 = df[3:]
|
|
|
|
tm.assert_index_equal(df['grade'].cat.categories,
|
|
df1['grade'].cat.categories)
|
|
tm.assert_index_equal(df['grade'].cat.categories,
|
|
df2['grade'].cat.categories)
|
|
|
|
dfx = pd.concat([df1, df2])
|
|
tm.assert_index_equal(df['grade'].cat.categories,
|
|
dfx['grade'].cat.categories)
|
|
|
|
dfa = df1.append(df2)
|
|
tm.assert_index_equal(df['grade'].cat.categories,
|
|
dfa['grade'].cat.categories)
|
|
|
|
def test_categorical_concat_preserve(self):
|
|
|
|
# GH 8641 series concat not preserving category dtype
|
|
# GH 13524 can concat different categories
|
|
s = Series(list('abc'), dtype='category')
|
|
s2 = Series(list('abd'), dtype='category')
|
|
|
|
exp = Series(list('abcabd'))
|
|
res = pd.concat([s, s2], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
exp = Series(list('abcabc'), dtype='category')
|
|
res = pd.concat([s, s], ignore_index=True)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2],
|
|
dtype='category')
|
|
res = pd.concat([s, s])
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
a = Series(np.arange(6, dtype='int64'))
|
|
b = Series(list('aabbca'))
|
|
|
|
df2 = DataFrame({'A': a,
|
|
'B': b.astype(CategoricalDtype(list('cab')))})
|
|
res = pd.concat([df2, df2])
|
|
exp = DataFrame(
|
|
{'A': pd.concat([a, a]),
|
|
'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))})
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_categorical_index_preserver(self):
|
|
|
|
a = Series(np.arange(6, dtype='int64'))
|
|
b = Series(list('aabbca'))
|
|
|
|
df2 = DataFrame({'A': a,
|
|
'B': b.astype(CategoricalDtype(list('cab')))
|
|
}).set_index('B')
|
|
result = pd.concat([df2, df2])
|
|
expected = DataFrame(
|
|
{'A': pd.concat([a, a]),
|
|
'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))
|
|
}).set_index('B')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# wrong catgories
|
|
df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe'))
|
|
}).set_index('B')
|
|
pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
|
|
|
|
def test_concat_categoricalindex(self):
|
|
# GH 16111, categories that aren't lexsorted
|
|
categories = [9, 0, 1, 2, 3]
|
|
|
|
a = pd.Series(1, index=pd.CategoricalIndex([9, 0],
|
|
categories=categories))
|
|
b = pd.Series(2, index=pd.CategoricalIndex([0, 1],
|
|
categories=categories))
|
|
c = pd.Series(3, index=pd.CategoricalIndex([1, 2],
|
|
categories=categories))
|
|
|
|
result = pd.concat([a, b, c], axis=1)
|
|
|
|
exp_idx = pd.CategoricalIndex([0, 1, 2, 9])
|
|
exp = pd.DataFrame({0: [1, np.nan, np.nan, 1],
|
|
1: [2, 2, np.nan, np.nan],
|
|
2: [np.nan, 3, 3, np.nan]},
|
|
columns=[0, 1, 2],
|
|
index=exp_idx)
|
|
tm.assert_frame_equal(result, exp)
|
|
|
|
def test_concat_order(self):
|
|
# GH 17344
|
|
dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])]
|
|
dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a'])
|
|
for i in range(100)]
|
|
|
|
result = pd.concat(dfs, sort=True).columns
|
|
|
|
if PY2:
|
|
# Different sort order between incomparable objects between
|
|
# python 2 and python3 via Index.union.
|
|
expected = dfs[1].columns
|
|
else:
|
|
expected = dfs[0].columns
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_concat_datetime_timezone(self):
|
|
# GH 18523
|
|
idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
|
|
tz='Europe/Paris')
|
|
idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
|
|
df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
|
|
df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
|
|
result = pd.concat([df1, df2], axis=1)
|
|
|
|
exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
|
|
'2011-01-01 01:00:00+01:00',
|
|
'2011-01-01 02:00:00+01:00'],
|
|
freq='H'
|
|
).tz_localize('UTC').tz_convert('Europe/Paris')
|
|
|
|
expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
|
|
index=exp_idx, columns=['a', 'b'])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
idx3 = pd.date_range('2011-01-01', periods=3,
|
|
freq='H', tz='Asia/Tokyo')
|
|
df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
|
|
result = pd.concat([df1, df3], axis=1)
|
|
|
|
exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
|
|
'2010-12-31 16:00:00+00:00',
|
|
'2010-12-31 17:00:00+00:00',
|
|
'2010-12-31 23:00:00+00:00',
|
|
'2011-01-01 00:00:00+00:00',
|
|
'2011-01-01 01:00:00+00:00']
|
|
).tz_localize('UTC')
|
|
|
|
expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
|
|
[1, np.nan], [2, np.nan], [3, np.nan]],
|
|
index=exp_idx, columns=['a', 'b'])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
|
|
@pytest.mark.parametrize('dt', np.sctypes['float'])
|
|
def test_concat_no_unnecessary_upcast(dt, pdt):
|
|
with catch_warnings(record=True):
|
|
# GH 13247
|
|
dims = pdt().ndim
|
|
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
|
|
pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
|
|
pdt(np.array([5], dtype=dt, ndmin=dims))]
|
|
x = pd.concat(dfs)
|
|
assert x.values.dtype == dt
|
|
|
|
|
|
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
|
|
@pytest.mark.parametrize('dt', np.sctypes['int'])
|
|
def test_concat_will_upcast(dt, pdt):
|
|
with catch_warnings(record=True):
|
|
dims = pdt().ndim
|
|
dfs = [pdt(np.array([1], dtype=dt, ndmin=dims)),
|
|
pdt(np.array([np.nan], ndmin=dims)),
|
|
pdt(np.array([5], dtype=dt, ndmin=dims))]
|
|
x = pd.concat(dfs)
|
|
assert x.values.dtype == 'float64'
|
|
|
|
|
|
def test_concat_empty_and_non_empty_frame_regression():
|
|
# GH 18178 regression test
|
|
df1 = pd.DataFrame({'foo': [1]})
|
|
df2 = pd.DataFrame({'foo': []})
|
|
expected = pd.DataFrame({'foo': [1.0]})
|
|
result = pd.concat([df1, df2])
|
|
assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_concat_empty_and_non_empty_series_regression():
|
|
# GH 18187 regression test
|
|
s1 = pd.Series([1])
|
|
s2 = pd.Series([])
|
|
expected = s1
|
|
result = pd.concat([s1, s2])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_concat_sorts_columns(sort_with_none):
|
|
# GH-4588
|
|
df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
|
|
df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]})
|
|
|
|
# for sort=True/None
|
|
expected = pd.DataFrame({"a": [1, 2, 3, 4],
|
|
"b": [1, 2, None, None],
|
|
"c": [None, None, 5, 6]},
|
|
columns=['a', 'b', 'c'])
|
|
|
|
if sort_with_none is False:
|
|
expected = expected[['b', 'a', 'c']]
|
|
|
|
if sort_with_none is None:
|
|
# only warn if not explicitly specified
|
|
ctx = tm.assert_produces_warning(FutureWarning)
|
|
else:
|
|
ctx = tm.assert_produces_warning(None)
|
|
|
|
# default
|
|
with ctx:
|
|
result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_concat_sorts_index(sort_with_none):
|
|
df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b'])
|
|
df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b'])
|
|
|
|
# For True/None
|
|
expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]},
|
|
index=['a', 'b', 'c'],
|
|
columns=['a', 'b'])
|
|
if sort_with_none is False:
|
|
expected = expected.loc[['c', 'a', 'b']]
|
|
|
|
if sort_with_none is None:
|
|
# only warn if not explicitly specified
|
|
ctx = tm.assert_produces_warning(FutureWarning)
|
|
else:
|
|
ctx = tm.assert_produces_warning(None)
|
|
|
|
# Warn and sort by default
|
|
with ctx:
|
|
result = pd.concat([df1, df2], axis=1, sort=sort_with_none)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_concat_inner_sort(sort_with_none):
|
|
# https://github.com/pandas-dev/pandas/pull/20613
|
|
df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]},
|
|
columns=['b', 'a', 'c'])
|
|
df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4])
|
|
|
|
with tm.assert_produces_warning(None):
|
|
# unset sort should *not* warn for inner join
|
|
# since that never sorted
|
|
result = pd.concat([df1, df2], sort=sort_with_none,
|
|
join='inner',
|
|
ignore_index=True)
|
|
|
|
expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]},
|
|
columns=['b', 'a'])
|
|
if sort_with_none is True:
|
|
expected = expected[['a', 'b']]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_concat_aligned_sort():
|
|
# GH-4588
|
|
df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]},
|
|
columns=['c', 'b', 'a'])
|
|
result = pd.concat([df, df], sort=True, ignore_index=True)
|
|
expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4],
|
|
'c': [1, 2, 1, 2]},
|
|
columns=['a', 'b', 'c'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True,
|
|
ignore_index=True)
|
|
expected = expected[['b', 'c']]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_concat_aligned_sort_does_not_raise():
|
|
# GH-4588
|
|
# We catch TypeErrors from sorting internally and do not re-raise.
|
|
df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a'])
|
|
expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]},
|
|
columns=[1, 'a'])
|
|
result = pd.concat([df, df], ignore_index=True, sort=True)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize("s1name,s2name", [
|
|
(np.int64(190), (43, 0)), (190, (43, 0))])
|
|
def test_concat_series_name_npscalar_tuple(s1name, s2name):
|
|
# GH21015
|
|
s1 = pd.Series({'a': 1, 'b': 2}, name=s1name)
|
|
s2 = pd.Series({'c': 5, 'd': 6}, name=s2name)
|
|
result = pd.concat([s1, s2])
|
|
expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6})
|
|
tm.assert_series_equal(result, expected)
|