# pandas/tests/reshape/test_concat.py
# (chunk of a ~2500-line, ~97 KiB test module; snapshot dated 2020-08-27)
from warnings import catch_warnings
from itertools import combinations, product
import datetime as dt
import dateutil
import numpy as np
from numpy.random import randn
from datetime import datetime
from pandas.compat import StringIO, iteritems, PY2
import pandas as pd
from pandas import (DataFrame, concat,
read_csv, isna, Series, date_range,
Index, Panel, MultiIndex, Timestamp,
DatetimeIndex, Categorical)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.util import testing as tm
from pandas.util.testing import (assert_frame_equal,
makeCustomDataframe as mkdf)
import pytest
@pytest.fixture(params=[True, False])
def sort(request):
    """Boolean sort keyword for concat and DataFrame.append."""
    return request.param
@pytest.fixture(params=[True, False, None])
def sort_with_none(request):
    """Boolean sort keyword for concat and DataFrame.append.

    Includes the default of None.
    """
    # TODO: Replace with sort once keyword changes.
    return request.param
class ConcatenateBase(object):
    """Shared fixture base: a float DataFrame and a mixed-dtype copy of it."""

    def setup_method(self, method):
        self.frame = DataFrame(tm.getSeriesData())
        # same frame plus a constant object column -> mixed dtypes
        self.mixed_frame = self.frame.copy()
        self.mixed_frame['foo'] = 'bar'
class TestConcatAppendCommon(ConcatenateBase):
    """
    Test common dtype coercion rules between concat and append.
    """

    def setup_method(self, method):
        # One three-element sample per dtype label; the dict keys double
        # as the expected dtype strings checked in _check_expected_dtype.
        dt_data = [pd.Timestamp('2011-01-01'),
                   pd.Timestamp('2011-01-02'),
                   pd.Timestamp('2011-01-03')]
        tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
                   pd.Timestamp('2011-01-02', tz='US/Eastern'),
                   pd.Timestamp('2011-01-03', tz='US/Eastern')]

        td_data = [pd.Timedelta('1 days'),
                   pd.Timedelta('2 days'),
                   pd.Timedelta('3 days')]

        period_data = [pd.Period('2011-01', freq='M'),
                       pd.Period('2011-02', freq='M'),
                       pd.Period('2011-03', freq='M')]

        self.data = {'bool': [True, False, True],
                     'int64': [1, 2, 3],
                     'float64': [1.1, np.nan, 3.3],
                     'category': pd.Categorical(['X', 'Y', 'Z']),
                     'object': ['a', 'b', 'c'],
                     'datetime64[ns]': dt_data,
                     'datetime64[ns, US/Eastern]': tz_data,
                     'timedelta64[ns]': td_data,
                     'period[M]': period_data}
    def _check_expected_dtype(self, obj, label):
        """
        Check whether obj has expected dtype depending on label
        considering not-supported dtypes
        """
        if isinstance(obj, pd.Index):
            if label == 'bool':
                # bool data is expected to be stored as object dtype in Index
                assert obj.dtype == 'object'
            else:
                assert obj.dtype == label
        elif isinstance(obj, pd.Series):
            if label.startswith('period'):
                # period data is expected to be stored as object dtype
                # in a Series at this pandas version
                assert obj.dtype == 'object'
            else:
                assert obj.dtype == label
        else:
            # only Index / Series are valid inputs
            raise ValueError
    def test_dtypes(self):
        """Sanity check that self.data covers the intended dtypes."""
        # to confirm test case covers intended dtypes
        for typ, vals in iteritems(self.data):
            self._check_expected_dtype(pd.Index(vals), typ)
            self._check_expected_dtype(pd.Series(vals), typ)
    def test_concatlike_same_dtypes(self):
        """Appending/concatenating same-dtype data preserves the dtype,
        for Index.append, Series.append and pd.concat alike."""
        # GH 13660
        for typ1, vals1 in iteritems(self.data):
            vals2 = vals1
            vals3 = vals1

            if typ1 == 'category':
                exp_data = pd.Categorical(list(vals1) + list(vals2))
                exp_data3 = pd.Categorical(list(vals1) + list(vals2) +
                                           list(vals3))
            else:
                exp_data = vals1 + vals2
                exp_data3 = vals1 + vals2 + vals3

            # ----- Index ----- #

            # index.append
            res = pd.Index(vals1).append(pd.Index(vals2))
            exp = pd.Index(exp_data)
            tm.assert_index_equal(res, exp)

            # 3 elements
            res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)])
            exp = pd.Index(exp_data3)
            tm.assert_index_equal(res, exp)

            # index.append name mismatch
            i1 = pd.Index(vals1, name='x')
            i2 = pd.Index(vals2, name='y')
            res = i1.append(i2)
            exp = pd.Index(exp_data)
            tm.assert_index_equal(res, exp)

            # index.append name match
            i1 = pd.Index(vals1, name='x')
            i2 = pd.Index(vals2, name='x')
            res = i1.append(i2)
            exp = pd.Index(exp_data, name='x')
            tm.assert_index_equal(res, exp)

            # cannot append non-index
            with tm.assert_raises_regex(TypeError,
                                        'all inputs must be Index'):
                pd.Index(vals1).append(vals2)

            with tm.assert_raises_regex(TypeError,
                                        'all inputs must be Index'):
                pd.Index(vals1).append([pd.Index(vals2), vals3])

            # ----- Series ----- #

            # series.append
            res = pd.Series(vals1).append(pd.Series(vals2),
                                          ignore_index=True)
            exp = pd.Series(exp_data)
            tm.assert_series_equal(res, exp, check_index_type=True)

            # concat
            res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
                            ignore_index=True)
            tm.assert_series_equal(res, exp, check_index_type=True)

            # 3 elements
            res = pd.Series(vals1).append([pd.Series(vals2),
                                           pd.Series(vals3)],
                                          ignore_index=True)
            exp = pd.Series(exp_data3)
            tm.assert_series_equal(res, exp)

            res = pd.concat([pd.Series(vals1), pd.Series(vals2),
                             pd.Series(vals3)], ignore_index=True)
            tm.assert_series_equal(res, exp)

            # name mismatch
            s1 = pd.Series(vals1, name='x')
            s2 = pd.Series(vals2, name='y')
            res = s1.append(s2, ignore_index=True)
            exp = pd.Series(exp_data)
            tm.assert_series_equal(res, exp, check_index_type=True)

            res = pd.concat([s1, s2], ignore_index=True)
            tm.assert_series_equal(res, exp, check_index_type=True)

            # name match
            s1 = pd.Series(vals1, name='x')
            s2 = pd.Series(vals2, name='x')
            res = s1.append(s2, ignore_index=True)
            exp = pd.Series(exp_data, name='x')
            tm.assert_series_equal(res, exp, check_index_type=True)

            res = pd.concat([s1, s2], ignore_index=True)
            tm.assert_series_equal(res, exp, check_index_type=True)

            # cannot append non-index
            msg = (r'cannot concatenate object of type \"(.+?)\";'
                   ' only pd.Series, pd.DataFrame, and pd.Panel'
                   r' \(deprecated\) objs are valid')
            with tm.assert_raises_regex(TypeError, msg):
                pd.Series(vals1).append(vals2)

            with tm.assert_raises_regex(TypeError, msg):
                pd.Series(vals1).append([pd.Series(vals2), vals3])

            with tm.assert_raises_regex(TypeError, msg):
                pd.concat([pd.Series(vals1), vals2])

            with tm.assert_raises_regex(TypeError, msg):
                pd.concat([pd.Series(vals1), pd.Series(vals2), vals3])
    def test_concatlike_dtypes_coercion(self):
        """Appending/concatenating mixed-dtype data coerces to the
        expected common dtype (None means 'let pandas infer')."""
        # GH 13660
        for typ1, vals1 in iteritems(self.data):
            for typ2, vals2 in iteritems(self.data):
                vals3 = vals2

                # basically infer
                exp_index_dtype = None
                exp_series_dtype = None

                if typ1 == typ2:
                    # same dtype is tested in test_concatlike_same_dtypes
                    continue
                elif typ1 == 'category' or typ2 == 'category':
                    # ToDo: suspicious
                    continue

                # specify expected dtype
                if typ1 == 'bool' and typ2 in ('int64', 'float64'):
                    # series coerces to numeric based on numpy rule
                    # index doesn't because bool is object dtype
                    exp_series_dtype = typ2
                elif typ2 == 'bool' and typ1 in ('int64', 'float64'):
                    exp_series_dtype = typ1
                elif (typ1 == 'datetime64[ns, US/Eastern]' or
                      typ2 == 'datetime64[ns, US/Eastern]' or
                      typ1 == 'timedelta64[ns]' or
                      typ2 == 'timedelta64[ns]'):
                    # any tz-aware / timedelta mix falls back to object
                    exp_index_dtype = object
                    exp_series_dtype = object

                exp_data = vals1 + vals2
                exp_data3 = vals1 + vals2 + vals3

                # ----- Index ----- #

                # index.append
                res = pd.Index(vals1).append(pd.Index(vals2))
                exp = pd.Index(exp_data, dtype=exp_index_dtype)
                tm.assert_index_equal(res, exp)

                # 3 elements
                res = pd.Index(vals1).append([pd.Index(vals2),
                                              pd.Index(vals3)])
                exp = pd.Index(exp_data3, dtype=exp_index_dtype)
                tm.assert_index_equal(res, exp)

                # ----- Series ----- #

                # series.append
                res = pd.Series(vals1).append(pd.Series(vals2),
                                              ignore_index=True)
                exp = pd.Series(exp_data, dtype=exp_series_dtype)
                tm.assert_series_equal(res, exp, check_index_type=True)

                # concat
                res = pd.concat([pd.Series(vals1), pd.Series(vals2)],
                                ignore_index=True)
                tm.assert_series_equal(res, exp, check_index_type=True)

                # 3 elements
                res = pd.Series(vals1).append([pd.Series(vals2),
                                               pd.Series(vals3)],
                                              ignore_index=True)
                exp = pd.Series(exp_data3, dtype=exp_series_dtype)
                tm.assert_series_equal(res, exp)

                res = pd.concat([pd.Series(vals1), pd.Series(vals2),
                                 pd.Series(vals3)], ignore_index=True)
                tm.assert_series_equal(res, exp)
    def test_concatlike_common_coerce_to_pandas_object(self):
        """datetime + timedelta concat yields an object Index holding
        pandas scalars (Timestamp/Timedelta), not stdlib types."""
        # GH 13626
        # result must be Timestamp/Timedelta, not datetime.datetime/timedelta
        dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02'])
        tdi = pd.TimedeltaIndex(['1 days', '2 days'])

        exp = pd.Index([pd.Timestamp('2011-01-01'),
                        pd.Timestamp('2011-01-02'),
                        pd.Timedelta('1 days'),
                        pd.Timedelta('2 days')])

        res = dti.append(tdi)
        tm.assert_index_equal(res, exp)
        assert isinstance(res[0], pd.Timestamp)
        assert isinstance(res[-1], pd.Timedelta)

        dts = pd.Series(dti)
        tds = pd.Series(tdi)
        res = dts.append(tds)
        # original integer labels are kept (no ignore_index)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
        assert isinstance(res.iloc[0], pd.Timestamp)
        assert isinstance(res.iloc[-1], pd.Timedelta)

        res = pd.concat([dts, tds])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
        assert isinstance(res.iloc[0], pd.Timestamp)
        assert isinstance(res.iloc[-1], pd.Timedelta)
    def test_concatlike_datetimetz(self, tz_aware_fixture):
        """Concatenating same-tz datetimes keeps the tz-aware dtype."""
        tz = tz_aware_fixture
        # GH 7795
        dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
        dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz)

        exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
                                '2012-01-01', '2012-01-02'], tz=tz)

        res = dti1.append(dti2)
        tm.assert_index_equal(res, exp)

        dts1 = pd.Series(dti1)
        dts2 = pd.Series(dti2)
        res = dts1.append(dts2)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts2])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
    @pytest.mark.parametrize('tz',
                             ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT'])
    def test_concatlike_datetimetz_short(self, tz):
        """DataFrame append/concat with tz-aware indexes keeps tz and
        simply stacks the index values."""
        # GH 7795
        ix1 = pd.DatetimeIndex(start='2014-07-15', end='2014-07-17',
                               freq='D', tz=tz)
        ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz)
        df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B'])
        df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B'])

        exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16',
                                    '2014-07-17', '2014-07-11',
                                    '2014-07-21'], tz=tz)
        exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B'])

        tm.assert_frame_equal(df1.append(df2), exp)
        tm.assert_frame_equal(pd.concat([df1, df2]), exp)
    def test_concatlike_datetimetz_to_object(self, tz_aware_fixture):
        """Mixing tz-aware with naive (or differently-tz'd) datetimes
        coerces the result to object dtype."""
        tz = tz_aware_fixture
        # GH 13660

        # different tz coerces to object
        dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz)
        dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'])

        exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
                        pd.Timestamp('2011-01-02', tz=tz),
                        pd.Timestamp('2012-01-01'),
                        pd.Timestamp('2012-01-02')], dtype=object)

        res = dti1.append(dti2)
        tm.assert_index_equal(res, exp)

        dts1 = pd.Series(dti1)
        dts2 = pd.Series(dti2)
        res = dts1.append(dts2)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts2])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        # different tz
        dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'],
                                tz='US/Pacific')

        exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz),
                        pd.Timestamp('2011-01-02', tz=tz),
                        pd.Timestamp('2012-01-01', tz='US/Pacific'),
                        pd.Timestamp('2012-01-02', tz='US/Pacific')],
                       dtype=object)

        res = dti1.append(dti3)
        # NOTE(review): the Index assertion below is disabled in this
        # snapshot; presumably the Index result did not match `exp` at this
        # pandas version -- confirm against upstream before re-enabling.
        # tm.assert_index_equal(res, exp)

        dts1 = pd.Series(dti1)
        dts3 = pd.Series(dti3)
        res = dts1.append(dts3)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([dts1, dts3])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
    def test_concatlike_common_period(self):
        """Same-frequency PeriodIndex append/concat keeps period dtype."""
        # GH 13660
        pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
        pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M')

        exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01',
                              '2012-02'], freq='M')

        res = pi1.append(pi2)
        tm.assert_index_equal(res, exp)

        ps1 = pd.Series(pi1)
        ps2 = pd.Series(pi2)
        res = ps1.append(ps2)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, ps2])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
    def test_concatlike_common_period_diff_freq_to_object(self):
        """Periods with different frequencies coerce to object dtype."""
        # GH 13221
        pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
        pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D')

        exp = pd.Index([pd.Period('2011-01', freq='M'),
                        pd.Period('2011-02', freq='M'),
                        pd.Period('2012-01-01', freq='D'),
                        pd.Period('2012-02-01', freq='D')], dtype=object)

        res = pi1.append(pi2)
        tm.assert_index_equal(res, exp)

        ps1 = pd.Series(pi1)
        ps2 = pd.Series(pi2)
        res = ps1.append(ps2)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, ps2])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
    def test_concatlike_common_period_mixed_dt_to_object(self):
        """Period mixed with timedelta coerces to object dtype, preserving
        element order in both concatenation directions."""
        # GH 13221
        # different datetimelike
        pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M')
        tdi = pd.TimedeltaIndex(['1 days', '2 days'])
        exp = pd.Index([pd.Period('2011-01', freq='M'),
                        pd.Period('2011-02', freq='M'),
                        pd.Timedelta('1 days'),
                        pd.Timedelta('2 days')], dtype=object)

        res = pi1.append(tdi)
        tm.assert_index_equal(res, exp)

        ps1 = pd.Series(pi1)
        tds = pd.Series(tdi)
        res = ps1.append(tds)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([ps1, tds])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        # inverse
        exp = pd.Index([pd.Timedelta('1 days'),
                        pd.Timedelta('2 days'),
                        pd.Period('2011-01', freq='M'),
                        pd.Period('2011-02', freq='M')], dtype=object)

        res = tdi.append(pi1)
        tm.assert_index_equal(res, exp)

        ps1 = pd.Series(pi1)
        tds = pd.Series(tdi)
        res = tds.append(ps1)
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))

        res = pd.concat([tds, ps1])
        tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1]))
    def test_concat_categorical(self):
        """Concat of categoricals stays categorical only when the
        category sets match; otherwise the result is not categorical."""
        # GH 13524

        # same categories -> category
        s1 = pd.Series([1, 2, np.nan], dtype='category')
        s2 = pd.Series([2, 1, 2], dtype='category')

        exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category')
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        # partially different categories => not-category
        s1 = pd.Series([3, 2], dtype='category')
        s2 = pd.Series([2, 1], dtype='category')

        exp = pd.Series([3, 2, 2, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        # completely different categories (same dtype) => not-category
        s1 = pd.Series([10, 11, np.nan], dtype='category')
        s2 = pd.Series([np.nan, 1, 3, 2], dtype='category')

        exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
    def test_union_categorical_same_categories_different_order(self):
        """Same categories in a different order still concat to category."""
        # https://github.com/pandas-dev/pandas/issues/19096
        a = pd.Series(Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']))
        b = pd.Series(Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']))
        result = pd.concat([a, b], ignore_index=True)
        expected = pd.Series(Categorical(['a', 'b', 'c', 'a', 'b', 'c'],
                                         categories=['a', 'b', 'c']))
        tm.assert_series_equal(result, expected)
    def test_concat_categorical_coercion(self):
        """Category mixed with non-category always gives non-category,
        regardless of operand order or value overlap."""
        # GH 13524

        # category + not-category => not-category
        s1 = pd.Series([1, 2, np.nan], dtype='category')
        s2 = pd.Series([2, 1, 2])

        exp = pd.Series([1, 2, np.nan, 2, 1, 2])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        # result shouldn't be affected by 1st elem dtype
        exp = pd.Series([2, 1, 2, 1, 2, np.nan])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)

        # all values are not in category => not-category
        s1 = pd.Series([3, 2], dtype='category')
        s2 = pd.Series([2, 1])

        exp = pd.Series([3, 2, 2, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        exp = pd.Series([2, 1, 3, 2])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)

        # completely different categories => not-category
        s1 = pd.Series([10, 11, np.nan], dtype='category')
        s2 = pd.Series([1, 3, 2])

        exp = pd.Series([10, 11, np.nan, 1, 3, 2])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        exp = pd.Series([1, 3, 2, 10, 11, np.nan])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)

        # different dtype => not-category
        s1 = pd.Series([10, 11, np.nan], dtype='category')
        s2 = pd.Series(['a', 'b', 'c'])

        exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c'])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)

        # if normal series only contains NaN-likes => not-category
        s1 = pd.Series([10, 11], dtype='category')
        s2 = pd.Series([np.nan, np.nan, np.nan])

        exp = pd.Series([10, 11, np.nan, np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        exp = pd.Series([np.nan, np.nan, np.nan, 10, 11])
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
    def test_concat_categorical_3elem_coercion(self):
        """Three-way concat including any non-category operand yields
        a non-category result."""
        # GH 13524

        # mixed dtypes => not-category
        s1 = pd.Series([1, 2, np.nan], dtype='category')
        s2 = pd.Series([2, 1, 2], dtype='category')
        s3 = pd.Series([1, 2, 1, 2, np.nan])

        exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)

        exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2])
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)

        # values are all in either category => not-category
        s1 = pd.Series([4, 5, 6], dtype='category')
        s2 = pd.Series([1, 2, 3], dtype='category')
        s3 = pd.Series([1, 3, 4])

        exp = pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4])
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)

        exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3])
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)

        # values are all in either category => not-category
        s1 = pd.Series([4, 5, 6], dtype='category')
        s2 = pd.Series([1, 2, 3], dtype='category')
        s3 = pd.Series([10, 11, 12])

        exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12])
        tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp)
        tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp)

        exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3])
        tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp)
    def test_concat_categorical_multi_coercion(self):
        """Many-way mixed category/non-category concat: values are kept in
        order and the result is not categorical."""
        # GH 13524

        s1 = pd.Series([1, 3], dtype='category')
        s2 = pd.Series([3, 4], dtype='category')
        s3 = pd.Series([2, 3])
        s4 = pd.Series([2, 2], dtype='category')
        s5 = pd.Series([1, np.nan])
        s6 = pd.Series([1, 3, 2], dtype='category')

        # mixed dtype, values are all in categories => not-category
        exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2])
        res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True)
        tm.assert_series_equal(res, exp)
        res = s1.append([s2, s3, s4, s5, s6], ignore_index=True)
        tm.assert_series_equal(res, exp)

        exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3])
        res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True)
        tm.assert_series_equal(res, exp)
        res = s6.append([s5, s4, s3, s2, s1], ignore_index=True)
        tm.assert_series_equal(res, exp)
    def test_concat_categorical_ordered(self):
        """Concat of ordered categoricals with matching categories keeps
        the ordered categorical dtype."""
        # GH 13524

        s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True))
        s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True))

        exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True))
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan],
                                       ordered=True))
        tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp)
    def test_concat_categorical_coercion_nan(self):
        """NaN-heavy edge cases: mixed category/non-category stays
        non-category; all-categorical NaNs stay categorical."""
        # GH 13524

        # some edge cases
        # category + not-category => not category
        s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64),
                       dtype='category')
        s2 = pd.Series([np.nan, 1])

        exp = pd.Series([np.nan, np.nan, np.nan, 1])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        s1 = pd.Series([1, np.nan], dtype='category')
        s2 = pd.Series([np.nan, np.nan])

        exp = pd.Series([1, np.nan, np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        # mixed dtype, all nan-likes => not-category
        s1 = pd.Series([np.nan, np.nan], dtype='category')
        s2 = pd.Series([np.nan, np.nan])

        exp = pd.Series([np.nan, np.nan, np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)

        # all category nan-likes => category
        s1 = pd.Series([np.nan, np.nan], dtype='category')
        s2 = pd.Series([np.nan, np.nan], dtype='category')

        exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category')

        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)
    def test_concat_categorical_empty(self):
        """Empty operands are effectively ignored; dtype of the result
        follows the non-empty (or mismatching-dtype) operand."""
        # GH 13524

        s1 = pd.Series([], dtype='category')
        s2 = pd.Series([1, 2], dtype='category')

        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)

        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)

        s1 = pd.Series([], dtype='category')
        s2 = pd.Series([], dtype='category')

        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)

        s1 = pd.Series([], dtype='category')
        s2 = pd.Series([], dtype='object')

        # different dtype => not-category
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), s2)
        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), s2)

        s1 = pd.Series([], dtype='category')
        s2 = pd.Series([np.nan, np.nan])

        # empty Series is ignored
        exp = pd.Series([np.nan, np.nan])
        tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp)
        tm.assert_series_equal(s1.append(s2, ignore_index=True), exp)

        tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp)
        tm.assert_series_equal(s2.append(s1, ignore_index=True), exp)
class TestAppend(ConcatenateBase):

    def test_append(self, sort):
        """Core DataFrame.append behaviors: row stacking, column union,
        mixed dtypes, empty frames, verify_integrity, Series-as-row."""
        begin_index = self.frame.index[:5]
        end_index = self.frame.index[5:]

        begin_frame = self.frame.reindex(begin_index)
        end_frame = self.frame.reindex(end_index)

        appended = begin_frame.append(end_frame)
        tm.assert_almost_equal(appended['A'], self.frame['A'])

        del end_frame['A']
        partial_appended = begin_frame.append(end_frame, sort=sort)
        assert 'A' in partial_appended

        partial_appended = end_frame.append(begin_frame, sort=sort)
        assert 'A' in partial_appended

        # mixed type handling
        appended = self.mixed_frame[:5].append(self.mixed_frame[5:])
        tm.assert_frame_equal(appended, self.mixed_frame)

        # what to test here
        mixed_appended = self.mixed_frame[:5].append(self.frame[5:], sort=sort)
        mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:],
                                                sort=sort)

        # all equal except 'foo' column
        tm.assert_frame_equal(
            mixed_appended.reindex(columns=['A', 'B', 'C', 'D']),
            mixed_appended2.reindex(columns=['A', 'B', 'C', 'D']))

        # append empty: result equals the original but is a new object
        empty = DataFrame({})

        appended = self.frame.append(empty)
        tm.assert_frame_equal(self.frame, appended)
        assert appended is not self.frame

        appended = empty.append(self.frame)
        tm.assert_frame_equal(self.frame, appended)
        assert appended is not self.frame

        # Overlap
        with pytest.raises(ValueError):
            self.frame.append(self.frame, verify_integrity=True)

        # see gh-6129: new columns
        df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}})
        row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z')
        expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': {
            'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}})
        result = df.append(row)
        tm.assert_frame_equal(result, expected)
    def test_append_length0_frame(self, sort):
        """Appending an all-NaN frame unions the columns."""
        df = DataFrame(columns=['A', 'B', 'C'])
        df3 = DataFrame(index=[0, 1], columns=['A', 'B'])
        df5 = df.append(df3, sort=sort)

        expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C'])
        assert_frame_equal(df5, expected)
    def test_append_records(self):
        """Appending frames built from numpy record arrays matches
        np.concatenate on the underlying arrays."""
        arr1 = np.zeros((2,), dtype=('i4,f4,a10'))
        arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")]

        arr2 = np.zeros((3,), dtype=('i4,f4,a10'))
        arr2[:] = [(3, 4., 'foo'),
                   (5, 6., "bar"),
                   (7., 8., 'baz')]

        df1 = DataFrame(arr1)
        df2 = DataFrame(arr2)

        result = df1.append(df2, ignore_index=True)
        expected = DataFrame(np.concatenate((arr1, arr2)))
        assert_frame_equal(result, expected)
    # rewrite sort fixture, since we also want to test default of None
    def test_append_sorts(self, sort_with_none):
        """sort=None warns (FutureWarning) and sorts; sort=False keeps
        the caller's column order; sort=True sorts columns."""
        df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
        df2 = pd.DataFrame({"a": [1, 2], 'c': [3, 4]}, index=[2, 3])

        if sort_with_none is None:
            # only warn if not explicitly specified
            # don't check stacklevel since its set for concat, and append
            # has an extra stack.
            ctx = tm.assert_produces_warning(FutureWarning,
                                             check_stacklevel=False)
        else:
            ctx = tm.assert_produces_warning(None)

        with ctx:
            result = df1.append(df2, sort=sort_with_none)

        # for None / True
        expected = pd.DataFrame({"b": [1, 2, None, None],
                                 "a": [1, 2, 1, 2],
                                 "c": [None, None, 3, 4]},
                                columns=['a', 'b', 'c'])
        if sort_with_none is False:
            expected = expected[['b', 'a', 'c']]
        tm.assert_frame_equal(result, expected)
    def test_append_different_columns(self, sort):
        """Columns missing from one side are filled with NaN."""
        df = DataFrame({'bools': np.random.randn(10) > 0,
                        'ints': np.random.randint(0, 10, 10),
                        'floats': np.random.randn(10),
                        'strings': ['foo', 'bar'] * 5})

        a = df[:5].loc[:, ['bools', 'ints', 'floats']]
        b = df[5:].loc[:, ['strings', 'ints', 'floats']]

        appended = a.append(b, sort=sort)
        assert isna(appended['strings'][0:4]).all()
        assert isna(appended['bools'][5:]).all()
    def test_append_many(self, sort):
        """append accepts a list of frames; a new column in the last
        chunk is NaN for the earlier rows."""
        chunks = [self.frame[:5], self.frame[5:10],
                  self.frame[10:15], self.frame[15:]]

        result = chunks[0].append(chunks[1:])
        tm.assert_frame_equal(result, self.frame)

        chunks[-1] = chunks[-1].copy()
        chunks[-1]['foo'] = 'bar'
        result = chunks[0].append(chunks[1:], sort=sort)
        tm.assert_frame_equal(result.loc[:, self.frame.columns], self.frame)
        assert (result['foo'][15:] == 'bar').all()
        assert result['foo'][:15].isna().all()
    def test_append_preserve_index_name(self):
        """The index name survives an append."""
        # #980
        df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
        df1 = df1.set_index(['A'])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                        columns=['A', 'B', 'C'])
        df2 = df2.set_index(['A'])

        result = df1.append(df2)
        assert result.index.name == 'A'
indexes_can_append = [
pd.RangeIndex(3),
pd.Index([4, 5, 6]),
pd.Index([4.5, 5.5, 6.5]),
pd.Index(list('abc')),
pd.CategoricalIndex('A B C'.split()),
pd.CategoricalIndex('D E F'.split(), ordered=True),
pd.DatetimeIndex([dt.datetime(2013, 1, 3, 0, 0),
dt.datetime(2013, 1, 3, 6, 10),
dt.datetime(2013, 1, 3, 7, 12)]),
]
indexes_cannot_append_with_other = [
pd.IntervalIndex.from_breaks([0, 1, 2, 3]),
pd.MultiIndex.from_arrays(['A B C'.split(), 'D E F'.split()]),
]
all_indexes = indexes_can_append + indexes_cannot_append_with_other
    @pytest.mark.parametrize("index",
                             all_indexes,
                             ids=lambda x: x.__class__.__name__)
    def test_append_same_columns_type(self, index):
        """Appending a Series row whose index is the same type as the
        frame's columns preserves that column index type."""
        # GH18359

        # df wider than ser
        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
        ser_index = index[:2]
        ser = pd.Series([7, 8], index=ser_index, name=2)
        result = df.append(ser)
        expected = pd.DataFrame([[1., 2., 3.], [4, 5, 6], [7, 8, np.nan]],
                                index=[0, 1, 2],
                                columns=index)
        assert_frame_equal(result, expected)

        # ser wider than df
        ser_index = index
        index = index[:2]
        df = pd.DataFrame([[1, 2], [4, 5]], columns=index)
        ser = pd.Series([7, 8, 9], index=ser_index, name=2)
        result = df.append(ser)
        expected = pd.DataFrame([[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
                                index=[0, 1, 2],
                                columns=ser_index)
        assert_frame_equal(result, expected)
    @pytest.mark.parametrize("df_columns, series_index",
                             combinations(indexes_can_append, r=2),
                             ids=lambda x: x.__class__.__name__)
    def test_append_different_columns_types(self, df_columns, series_index):
        """Appending a Series whose index type differs from the frame's
        columns unions the labels into a plain Index."""
        # GH18359
        # See also test 'test_append_different_columns_types_raises' below
        # for errors raised when appending

        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
        ser = pd.Series([7, 8, 9], index=series_index, name=2)

        result = df.append(ser)
        idx_diff = ser.index.difference(df_columns)
        combined_columns = Index(df_columns.tolist()).append(idx_diff)
        expected = pd.DataFrame([[1., 2., 3., np.nan, np.nan, np.nan],
                                 [4, 5, 6, np.nan, np.nan, np.nan],
                                 [np.nan, np.nan, np.nan, 7, 8, 9]],
                                index=[0, 1, 2],
                                columns=combined_columns)
        assert_frame_equal(result, expected)
    @pytest.mark.parametrize(
        "index_can_append, index_cannot_append_with_other",
        product(indexes_can_append, indexes_cannot_append_with_other),
        ids=lambda x: x.__class__.__name__)
    def test_append_different_columns_types_raises(
            self, index_can_append, index_cannot_append_with_other):
        """IntervalIndex/MultiIndex columns raise TypeError when appended
        to (or from) a different column index type."""
        # GH18359
        # Dataframe.append will raise if IntervalIndex/MultiIndex appends
        # or is appended to a different index type
        #
        # See also test 'test_append_different_columns_types' above for
        # appending without raising.

        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
        ser = pd.Series([7, 8, 9], index=index_cannot_append_with_other,
                        name=2)
        with pytest.raises(TypeError):
            df.append(ser)

        df = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                          columns=index_cannot_append_with_other)
        ser = pd.Series([7, 8, 9], index=index_can_append, name=2)
        with pytest.raises(TypeError):
            df.append(ser)
    def test_append_dtype_coerce(self, sort):
        """Appending frames with a partially-overlapping datetime column
        keeps datetime64 dtype and fills missing entries with NaT."""
        # GH 4993
        # appending with datetime will incorrectly convert datetime64

        df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0),
                                            dt.datetime(2013, 1, 2, 0, 0)],
                        columns=['start_time'])
        df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0),
                                             dt.datetime(2013, 1, 3, 6, 10)],
                                            [dt.datetime(2013, 1, 4, 0, 0),
                                             dt.datetime(2013, 1, 4, 7, 10)]],
                        columns=['start_time', 'end_time'])

        expected = concat([Series([pd.NaT,
                                   pd.NaT,
                                   dt.datetime(2013, 1, 3, 6, 10),
                                   dt.datetime(2013, 1, 4, 7, 10)],
                                  name='end_time'),
                           Series([dt.datetime(2013, 1, 1, 0, 0),
                                   dt.datetime(2013, 1, 2, 0, 0),
                                   dt.datetime(2013, 1, 3, 0, 0),
                                   dt.datetime(2013, 1, 4, 0, 0)],
                                  name='start_time')],
                          axis=1, sort=sort)
        result = df1.append(df2, ignore_index=True, sort=sort)
        if sort:
            expected = expected[['end_time', 'start_time']]
        else:
            expected = expected[['start_time', 'end_time']]

        assert_frame_equal(result, expected)
    def test_append_missing_column_proper_upcast(self, sort):
        """Columns missing on one side upcast: int64 -> float64 (NaN
        fill), bool -> object."""
        df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')})
        df2 = DataFrame({'B': np.array([True, False, True, False],
                                       dtype=bool)})

        appended = df1.append(df2, ignore_index=True, sort=sort)
        assert appended['A'].dtype == 'f8'
        assert appended['B'].dtype == 'O'
class TestConcatenate(ConcatenateBase):

    def test_concat_copy(self):
        """copy=True materializes new blocks; copy=False reuses the
        input blocks' underlying arrays where possible."""
        df = DataFrame(np.random.randn(4, 3))
        df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
        df3 = DataFrame({5: 'foo'}, index=range(4))

        # These are actual copies.
        result = concat([df, df2, df3], axis=1, copy=True)

        for b in result._data.blocks:
            assert b.values.base is None

        # These are the same.
        result = concat([df, df2, df3], axis=1, copy=False)

        for b in result._data.blocks:
            if b.is_float:
                assert b.values.base is df._data.blocks[0].values.base
            elif b.is_integer:
                assert b.values.base is df2._data.blocks[0].values.base
            elif b.is_object:
                assert b.values.base is not None

        # Float block was consolidated.
        df4 = DataFrame(np.random.randn(4, 1))
        result = concat([df, df2, df3, df4], axis=1, copy=False)
        for b in result._data.blocks:
            if b.is_float:
                assert b.values.base is None
            elif b.is_integer:
                assert b.values.base is df2._data.blocks[0].values.base
            elif b.is_object:
                assert b.values.base is not None
def test_concat_with_group_keys(self):
df = DataFrame(np.random.randn(4, 3))
df2 = DataFrame(np.random.randn(4, 4))
# axis=0
df = DataFrame(np.random.randn(3, 4))
df2 = DataFrame(np.random.randn(4, 4))
result = concat([df, df2], keys=[0, 1])
exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1],
[0, 1, 2, 0, 1, 2, 3]])
expected = DataFrame(np.r_[df.values, df2.values],
index=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1])
exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1],
[0, 1, 2, 0, 1, 2]])
expected = DataFrame(np.r_[df.values, df.values],
index=exp_index2)
tm.assert_frame_equal(result, expected)
# axis=1
df = DataFrame(np.random.randn(4, 3))
df2 = DataFrame(np.random.randn(4, 4))
result = concat([df, df2], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df2.values],
columns=exp_index)
tm.assert_frame_equal(result, expected)
result = concat([df, df], keys=[0, 1], axis=1)
expected = DataFrame(np.c_[df.values, df.values],
columns=exp_index2)
tm.assert_frame_equal(result, expected)
    def test_concat_keys_specific_levels(self):
        """Explicitly passed ``levels`` become the outer column level in the
        given order, even entries never used as a key."""
        df = DataFrame(np.random.randn(10, 4))
        pieces = [df.iloc[:, [0, 1]], df.iloc[:, [2]], df.iloc[:, [3]]]
        # 'zero' is not among the keys but must survive in the level
        level = ['three', 'two', 'one', 'zero']
        result = concat(pieces, axis=1, keys=['one', 'two', 'three'],
                        levels=[level],
                        names=['group_key'])
        tm.assert_index_equal(result.columns.levels[0],
                              Index(level, name='group_key'))
        assert result.columns.names[0] == 'group_key'
def test_concat_dataframe_keys_bug(self, sort):
t1 = DataFrame({
'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'],
name='id'))})
t2 = DataFrame({
'value': Series([7, 8], index=Index(['a', 'b'], name='id'))})
# it works
result = concat([t1, t2], axis=1, keys=['t1', 't2'], sort=sort)
assert list(result.columns) == [('t1', 'value'), ('t2', 'value')]
    def test_concat_series_partial_columns_names(self):
        """Column labels for Series on axis=1: named series keep the name,
        unnamed ones get positions, ``keys`` overrides, ``ignore_index``
        renumbers everything."""
        # GH10698
        foo = Series([1, 2], name='foo')
        bar = Series([1, 2])
        baz = Series([4, 5])
        result = concat([foo, bar, baz], axis=1)
        expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [
            4, 5]}, columns=['foo', 0, 1])
        tm.assert_frame_equal(result, expected)
        result = concat([foo, bar, baz], axis=1, keys=[
            'red', 'blue', 'yellow'])
        expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [
            4, 5]}, columns=['red', 'blue', 'yellow'])
        tm.assert_frame_equal(result, expected)
        result = concat([foo, bar, baz], axis=1, ignore_index=True)
        expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
        tm.assert_frame_equal(result, expected)
def test_concat_dict(self):
frames = {'foo': DataFrame(np.random.randn(4, 3)),
'bar': DataFrame(np.random.randn(4, 3)),
'baz': DataFrame(np.random.randn(4, 3)),
'qux': DataFrame(np.random.randn(4, 3))}
sorted_keys = sorted(frames)
result = concat(frames)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys)
tm.assert_frame_equal(result, expected)
result = concat(frames, axis=1)
expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys,
axis=1)
tm.assert_frame_equal(result, expected)
keys = ['baz', 'foo', 'bar']
result = concat(frames, keys=keys)
expected = concat([frames[k] for k in keys], keys=keys)
tm.assert_frame_equal(result, expected)
    def test_concat_ignore_index(self, sort):
        """axis=1 concat with ignore_index unions the row indexes (order
        governed by ``sort``) and renumbers the columns."""
        frame1 = DataFrame({"test1": ["a", "b", "c"],
                            "test2": [1, 2, 3],
                            "test3": [4.5, 3.2, 1.2]})
        frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]})
        frame1.index = Index(["x", "y", "z"])
        frame2.index = Index(["x", "y", "q"])
        v1 = concat([frame1, frame2], axis=1,
                    ignore_index=True, sort=sort)
        nan = np.nan
        expected = DataFrame([[nan, nan, nan, 4.3],
                              ['a', 1, 4.5, 5.2],
                              ['b', 2, 3.2, 2.2],
                              ['c', 3, 1.2, nan]],
                             index=Index(["q", "x", "y", "z"]))
        if not sort:
            # unsorted union keeps first-seen order: frame1's rows, then 'q'
            expected = expected.loc[['x', 'y', 'z', 'q']]
        tm.assert_frame_equal(v1, expected)
    def test_concat_multiindex_with_keys(self):
        """keys + names prepend a new named level onto an existing
        MultiIndex; selecting a key recovers the original frame."""
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        frame = DataFrame(np.random.randn(10, 3), index=index,
                          columns=Index(['A', 'B', 'C'], name='exp'))
        result = concat([frame, frame], keys=[0, 1], names=['iteration'])
        assert result.index.names == ('iteration',) + index.names
        tm.assert_frame_equal(result.loc[0], frame)
        tm.assert_frame_equal(result.loc[1], frame)
        assert result.index.nlevels == 3
    def test_concat_multiindex_with_tz(self):
        """tz-aware datetimes inside a MultiIndex survive concat."""
        # GH 6606
        df = DataFrame({'dt': [datetime(2014, 1, 1),
                               datetime(2014, 1, 2),
                               datetime(2014, 1, 3)],
                        'b': ['A', 'B', 'C'],
                        'c': [1, 2, 3], 'd': [4, 5, 6]})
        df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
        df = df.set_index(['dt', 'b'])
        exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02',
                                  '2014-01-03'] * 2,
                                 tz='US/Pacific', name='dt')
        exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2},
                             index=exp_idx, columns=['c', 'd'])
        result = concat([df, df])
        tm.assert_frame_equal(result, expected)
    def test_concat_multiindex_with_none_in_index_names(self):
        """A ``None`` entry among MultiIndex names is preserved when a new
        keyed level is prepended."""
        # GH 15787
        index = pd.MultiIndex.from_product([[1], range(5)],
                                           names=['level1', None])
        df = pd.DataFrame({'col': range(5)}, index=index, dtype=np.int32)
        result = concat([df, df], keys=[1, 2], names=['level2'])
        index = pd.MultiIndex.from_product([[1, 2], [1], range(5)],
                                           names=['level2', 'level1', None])
        expected = pd.DataFrame({'col': list(range(5)) * 2},
                                index=index, dtype=np.int32)
        assert_frame_equal(result, expected)
        # unequal-length pieces exercise the tuple-based index construction
        result = concat([df, df[:2]], keys=[1, 2], names=['level2'])
        level2 = [1] * 5 + [2] * 2
        level1 = [1] * 7
        no_name = list(range(5)) + list(range(2))
        tuples = list(zip(level2, level1, no_name))
        index = pd.MultiIndex.from_tuples(tuples,
                                          names=['level2', 'level1', None])
        expected = pd.DataFrame({'col': no_name}, index=index,
                                dtype=np.int32)
        assert_frame_equal(result, expected)
    def test_concat_keys_and_levels(self):
        """Tuple keys with explicit ``levels``/``names`` build a three-level
        index; omitting names leaves them None, omitting levels infers
        (sorted) levels from the keys."""
        df = DataFrame(np.random.randn(1, 3))
        df2 = DataFrame(np.random.randn(1, 4))
        levels = [['foo', 'baz'], ['one', 'two']]
        names = ['first', 'second']
        result = concat([df, df2, df, df2],
                        keys=[('foo', 'one'), ('foo', 'two'),
                              ('baz', 'one'), ('baz', 'two')],
                        levels=levels,
                        names=names)
        expected = concat([df, df2, df, df2])
        exp_index = MultiIndex(levels=levels + [[0]],
                               labels=[[0, 0, 1, 1], [0, 1, 0, 1],
                                       [0, 0, 0, 0]],
                               names=names + [None])
        expected.index = exp_index
        tm.assert_frame_equal(result, expected)
        # no names
        result = concat([df, df2, df, df2],
                        keys=[('foo', 'one'), ('foo', 'two'),
                              ('baz', 'one'), ('baz', 'two')],
                        levels=levels)
        assert result.index.names == (None,) * 3
        # no levels
        result = concat([df, df2, df, df2],
                        keys=[('foo', 'one'), ('foo', 'two'),
                              ('baz', 'one'), ('baz', 'two')],
                        names=['first', 'second'])
        assert result.index.names == ('first', 'second') + (None,)
        # inferred level is sorted: ['baz', 'foo']
        tm.assert_index_equal(result.index.levels[0],
                              Index(['baz', 'foo'], name='first'))
def test_concat_keys_levels_no_overlap(self):
# GH #1406
df = DataFrame(np.random.randn(1, 3), index=['a'])
df2 = DataFrame(np.random.randn(1, 4), index=['b'])
pytest.raises(ValueError, concat, [df, df],
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
pytest.raises(ValueError, concat, [df, df2],
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
    def test_concat_rename_index(self):
        """Passing both level names up front matches naming one level at
        concat time and setting the second name afterwards."""
        a = DataFrame(np.random.rand(3, 3),
                      columns=list('ABC'),
                      index=Index(list('abc'), name='index_a'))
        b = DataFrame(np.random.rand(3, 3),
                      columns=list('ABC'),
                      index=Index(list('abc'), name='index_b'))
        result = concat([a, b], keys=['key0', 'key1'],
                        names=['lvl0', 'lvl1'])
        exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0'])
        names = list(exp.index.names)
        names[1] = 'lvl1'
        exp.index.set_names(names, inplace=True)
        tm.assert_frame_equal(result, exp)
        assert result.index.names == exp.index.names
    def test_crossed_dtypes_weird_corner(self):
        """Appending frames whose columns swap int64/float64 dtypes matches
        a positional value concat; also re-checks index names after a keyed
        concat."""
        columns = ['A', 'B', 'C', 'D']
        df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'),
                         'B': np.array([1, 2, 3, 4], dtype='i8'),
                         'C': np.array([1, 2, 3, 4], dtype='f8'),
                         'D': np.array([1, 2, 3, 4], dtype='i8')},
                        columns=columns)
        df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'),
                         'B': np.array([1, 2, 3, 4], dtype='f8'),
                         'C': np.array([1, 2, 3, 4], dtype='i8'),
                         'D': np.array([1, 2, 3, 4], dtype='f8')},
                        columns=columns)
        appended = df1.append(df2, ignore_index=True)
        expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0),
                             columns=columns)
        tm.assert_frame_equal(appended, expected)
        # unrelated follow-up: names survive a keyed concat
        df = DataFrame(np.random.randn(1, 3), index=['a'])
        df2 = DataFrame(np.random.randn(1, 4), index=['b'])
        result = concat(
            [df, df2], keys=['one', 'two'], names=['first', 'second'])
        assert result.index.names == ('first', 'second')
    def test_dups_index(self):
        """concat and append must round-trip frames with duplicate column
        labels, for both single-dtype and mixed-dtype blocks."""
        # GH 4771
        # single dtypes
        df = DataFrame(np.random.randint(0, 10, size=40).reshape(
            10, 4), columns=['A', 'A', 'C', 'C'])
        result = concat([df, df], axis=1)
        assert_frame_equal(result.iloc[:, :4], df)
        assert_frame_equal(result.iloc[:, 4:], df)
        result = concat([df, df], axis=0)
        assert_frame_equal(result.iloc[:10], df)
        assert_frame_equal(result.iloc[10:], df)
        # multi dtypes
        df = concat([DataFrame(np.random.randn(10, 4),
                               columns=['A', 'A', 'B', 'B']),
                     DataFrame(np.random.randint(0, 10, size=20)
                               .reshape(10, 2),
                               columns=['A', 'C'])],
                    axis=1)
        result = concat([df, df], axis=1)
        assert_frame_equal(result.iloc[:, :6], df)
        assert_frame_equal(result.iloc[:, 6:], df)
        result = concat([df, df], axis=0)
        assert_frame_equal(result.iloc[:10], df)
        assert_frame_equal(result.iloc[10:], df)
        # append
        result = df.iloc[0:8, :].append(df.iloc[8:])
        assert_frame_equal(result, df)
        result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10])
        assert_frame_equal(result, df)
        expected = concat([df, df], axis=0)
        result = df.append(df)
        assert_frame_equal(result, expected)
def test_with_mixed_tuples(self, sort):
# 10697
# columns have mixed tuples, so handle properly
df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2))
df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2))
# it works
concat([df1, df2], sort=sort)
    def test_handle_empty_objects(self, sort):
        """Zero-row frames in the concat list contribute nothing; fully
        empty DataFrames are no-ops on either side and axis."""
        df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
        baz = df[:5].copy()
        baz['foo'] = 'bar'
        empty = df[5:5]
        frames = [baz, empty, empty, df[5:]]
        concatted = concat(frames, axis=0, sort=sort)
        expected = df.reindex(columns=['a', 'b', 'c', 'd', 'foo'])
        expected['foo'] = expected['foo'].astype('O')
        # only the first half carries the extra column's values
        expected.loc[0:4, 'foo'] = 'bar'
        tm.assert_frame_equal(concatted, expected)
        # empty as first element with time series
        # GH3259
        df = DataFrame(dict(A=range(10000)), index=date_range(
            '20130101', periods=10000, freq='s'))
        empty = DataFrame()
        result = concat([df, empty], axis=1)
        assert_frame_equal(result, df)
        result = concat([empty, df], axis=1)
        assert_frame_equal(result, df)
        result = concat([df, empty])
        assert_frame_equal(result, df)
        result = concat([empty, df])
        assert_frame_equal(result, df)
    def test_concat_mixed_objs(self):
        """Mixing Series and DataFrames: axis=1 lays out columns (duplicate
        labels allowed, names kept unless ignore_index), axis=0 stacks all
        values into a single column; Panel + Series raises."""
        # concat mixed series/frames
        # G2385
        # axis 1
        index = date_range('01-Jan-2013', periods=10, freq='H')
        arr = np.arange(10, dtype='int64')
        s1 = Series(arr, index=index)
        s2 = Series(arr, index=index)
        df = DataFrame(arr.reshape(-1, 1), index=index)
        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
                             index=index, columns=[0, 0])
        result = concat([df, df], axis=1)
        assert_frame_equal(result, expected)
        expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2),
                             index=index, columns=[0, 1])
        result = concat([s1, s2], axis=1)
        assert_frame_equal(result, expected)
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=[0, 1, 2])
        result = concat([s1, s2, s1], axis=1)
        assert_frame_equal(result, expected)
        expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5),
                             index=index, columns=[0, 0, 1, 2, 3])
        result = concat([s1, df, s2, s2, s1], axis=1)
        assert_frame_equal(result, expected)
        # with names (s1/s2 are mutated in place from here on)
        s1.name = 'foo'
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=['foo', 0, 0])
        result = concat([s1, df, s2], axis=1)
        assert_frame_equal(result, expected)
        s2.name = 'bar'
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=['foo', 0, 'bar'])
        result = concat([s1, df, s2], axis=1)
        assert_frame_equal(result, expected)
        # ignore index
        expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3),
                             index=index, columns=[0, 1, 2])
        result = concat([s1, df, s2], axis=1, ignore_index=True)
        assert_frame_equal(result, expected)
        # axis 0
        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1),
                             index=index.tolist() * 3, columns=[0])
        result = concat([s1, df, s2])
        assert_frame_equal(result, expected)
        expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0])
        result = concat([s1, df, s2], ignore_index=True)
        assert_frame_equal(result, expected)
        # invalid concatenate of mixed dims
        with catch_warnings(record=True):
            panel = tm.makePanel()
            pytest.raises(ValueError, lambda: concat([panel, s1], axis=1))
    def test_empty_dtype_coerce(self):
        """Columns that are entirely None keep their dtypes across concat."""
        # xref to #12411
        # xref to #12045
        # xref to #11594
        # see below
        # 10571
        df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b'])
        df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b'])
        result = concat([df1, df2])
        expected = df1.dtypes
        tm.assert_series_equal(result.dtypes, expected)
    def test_dtype_coerceion(self):
        """dtypes survive concatenating single-row slices of a frame back
        together (datetime-with-NaT, year-1012 dates, mostly-None object)."""
        # 12411
        df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'),
                                 pd.NaT]})
        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)
        # 12045
        import datetime
        df = DataFrame({'date': [datetime.datetime(2012, 1, 1),
                                 datetime.datetime(1012, 1, 2)]})
        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)
        # 11594
        df = DataFrame({'text': ['some words'] + [None] * 9})
        result = concat([df.iloc[[0]], df.iloc[[1]]])
        tm.assert_series_equal(result.dtypes, df.dtypes)
    def test_panel_concat_other_axes(self):
        """Panel concat along axis 1 and axis 2 reassembles slices; a piece
        that introduced strings upcasts the affected item to object."""
        with catch_warnings(record=True):
            panel = tm.makePanel()
            p1 = panel.iloc[:, :5, :]
            p2 = panel.iloc[:, 5:, :]
            result = concat([p1, p2], axis=1)
            tm.assert_panel_equal(result, panel)
            p1 = panel.iloc[:, :, :2]
            p2 = panel.iloc[:, :, 2:]
            result = concat([p1, p2], axis=2)
            tm.assert_panel_equal(result, panel)
            # if things are a bit misbehaved
            p1 = panel.iloc[:2, :, :2]
            p2 = panel.iloc[:, :, 2:]
            p1['ItemC'] = 'baz'
            result = concat([p1, p2], axis=2)
            expected = panel.copy()
            expected['ItemC'] = expected['ItemC'].astype('O')
            expected.loc['ItemC', :, :2] = 'baz'
            tm.assert_panel_equal(result, expected)
    def test_panel_concat_buglet(self, sort):
        """Panel concat along axis=1 with disjoint, relabeled axes must
        work even with verify_integrity=True (regression for #2257)."""
        with catch_warnings(record=True):
            # #2257
            def make_panel():
                # small 3-item panel of 5x3 random frames
                index = 5
                cols = 3
                def df():
                    return DataFrame(np.random.randn(index, cols),
                                     index=["I%s" % i for i in range(index)],
                                     columns=["C%s" % i for i in range(cols)])
                return Panel(dict(("Item%s" % x, df())
                                  for x in ['A', 'B', 'C']))
            panel1 = make_panel()
            panel2 = make_panel()
            # relabel panel2/3 so no axis labels overlap with panel1
            panel2 = panel2.rename_axis(dict((x, "%s_1" % x)
                                             for x in panel2.major_axis),
                                        axis=1)
            panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1)
            panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2)
            # it works!
            concat([panel1, panel3], axis=1, verify_integrity=True, sort=sort)
    def test_concat_series(self):
        """Series concat preserves values and name; with ``keys`` the
        result gains a (key, original index) MultiIndex."""
        ts = tm.makeTimeSeries()
        ts.name = 'foo'
        pieces = [ts[:5], ts[5:15], ts[15:]]
        result = concat(pieces)
        tm.assert_series_equal(result, ts)
        assert result.name == ts.name
        result = concat(pieces, keys=[0, 1, 2])
        expected = ts.copy()
        # rebuild the index from raw values (constructed without a freq)
        ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]'))
        exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]),
                      np.arange(len(ts))]
        exp_index = MultiIndex(levels=[[0, 1, 2], ts.index],
                               labels=exp_labels)
        expected.index = exp_index
        tm.assert_series_equal(result, expected)
def test_concat_series_axis1(self, sort=sort):
ts = tm.makeTimeSeries()
pieces = [ts[:-2], ts[2:], ts[2:-2]]
result = concat(pieces, axis=1)
expected = DataFrame(pieces).T
assert_frame_equal(result, expected)
result = concat(pieces, keys=['A', 'B', 'C'], axis=1)
expected = DataFrame(pieces, index=['A', 'B', 'C']).T
assert_frame_equal(result, expected)
# preserve series names, #2489
s = Series(randn(5), name='A')
s2 = Series(randn(5), name='B')
result = concat([s, s2], axis=1)
expected = DataFrame({'A': s, 'B': s2})
assert_frame_equal(result, expected)
s2.name = None
result = concat([s, s2], axis=1)
tm.assert_index_equal(result.columns,
Index(['A', 0], dtype='object'))
# must reindex, #2603
s = Series(randn(3), index=['c', 'a', 'b'], name='A')
s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B')
result = concat([s, s2], axis=1, sort=sort)
expected = DataFrame({'A': s, 'B': s2})
assert_frame_equal(result, expected)
def test_concat_single_with_key(self):
df = DataFrame(np.random.randn(10, 4))
result = concat([df], keys=['foo'])
expected = concat([df, df], keys=['foo', 'bar'])
tm.assert_frame_equal(result, expected[:10])
def test_concat_exclude_none(self):
df = DataFrame(np.random.randn(10, 4))
pieces = [df[:5], None, None, df[5:]]
result = concat(pieces)
tm.assert_frame_equal(result, df)
pytest.raises(ValueError, concat, [None, None])
def test_concat_datetime64_block(self):
from pandas.core.indexes.datetimes import date_range
rng = date_range('1/1/2000', periods=10)
df = DataFrame({'time': rng})
result = concat([df, df])
assert (result.iloc[:10]['time'] == rng).all()
assert (result.iloc[10:]['time'] == rng).all()
def test_concat_timedelta64_block(self):
from pandas import to_timedelta
rng = to_timedelta(np.arange(10), unit='s')
df = DataFrame({'time': rng})
result = concat([df, df])
assert (result.iloc[:10]['time'] == rng).all()
assert (result.iloc[10:]['time'] == rng).all()
    def test_concat_keys_with_none(self):
        """None values are dropped both from a dict input and, together
        with the matching key, from a list passed with ``keys``."""
        # #1649
        df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]])
        result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0))
        expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0))
        tm.assert_frame_equal(result, expected)
        result = concat([None, df0, df0[:2], df0[:1], df0],
                        keys=['a', 'b', 'c', 'd', 'e'])
        expected = concat([df0, df0[:2], df0[:1], df0],
                          keys=['b', 'c', 'd', 'e'])
        tm.assert_frame_equal(result, expected)
    def test_concat_bug_1719(self):
        """Outer-join axis=1 concat of different-length series produces the
        same row count regardless of argument order (regression #1719)."""
        ts1 = tm.makeTimeSeries()
        ts2 = tm.makeTimeSeries()[::2]
        # to join with union
        # these two are of different length!
        left = concat([ts1, ts2], join='outer', axis=1)
        right = concat([ts2, ts1], join='outer', axis=1)
        assert len(left) == len(right)
    def test_concat_bug_2972(self):
        """axis=1 concat of two series sharing a name keeps the duplicated
        column label (regression #2972)."""
        ts0 = Series(np.zeros(5))
        ts1 = Series(np.ones(5))
        ts0.name = ts1.name = 'same name'
        result = concat([ts0, ts1], axis=1)
        expected = DataFrame({0: ts0, 1: ts1})
        expected.columns = ['same name', 'same name']
        assert_frame_equal(result, expected)
    def test_concat_bug_3602(self):
        """axis=1 concat keeps the duplicate 'prc' column contributed by
        both frames."""
        # GH 3602, duplicate columns
        df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'prc': [6, 6, 6, 6],
                         'stringvar': ['rrr', 'rrr', 'rrr', 'rrr']})
        df2 = DataFrame({'C': [9, 10, 11, 12], 'misc': [1, 2, 3, 4],
                         'prc': [6, 6, 6, 6]})
        expected = DataFrame([[0, 6, 'rrr', 9, 1, 6],
                              [0, 6, 'rrr', 10, 2, 6],
                              [0, 6, 'rrr', 11, 3, 6],
                              [0, 6, 'rrr', 12, 4, 6]])
        expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc']
        result = concat([df1, df2], axis=1)
        assert_frame_equal(result, expected)
    def test_concat_inner_join_empty(self):
        """axis=1 concat with an empty frame: inner join empties the
        result, outer join keeps the non-empty side unchanged."""
        # GH 15328
        df_empty = pd.DataFrame()
        df_a = pd.DataFrame({'a': [1, 2]}, index=[0, 1], dtype='int64')
        df_expected = pd.DataFrame({'a': []}, index=[], dtype='int64')
        for how, expected in [('inner', df_expected), ('outer', df_a)]:
            result = pd.concat([df_a, df_empty], axis=1, join=how)
            assert_frame_equal(result, expected)
    def test_concat_series_axis1_same_names_ignore_index(self):
        """ignore_index on axis=1 renumbers the columns even when the
        input series share a name."""
        dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1]
        s1 = Series(randn(len(dates)), index=dates, name='value')
        s2 = Series(randn(len(dates)), index=dates, name='value')
        result = concat([s1, s2], axis=1, ignore_index=True)
        expected = Index([0, 1])
        tm.assert_index_equal(result.columns, expected)
    def test_concat_iterables(self):
        """concat accepts any iterable of frames: tuple, list, generator,
        deque, sequence-protocol objects, and Iterable subclasses."""
        from collections import deque, Iterable
        # NOTE(review): Iterable lives in collections.abc on py3.3+; this
        # import emits a DeprecationWarning on 3.7+ (kept for PY2 compat)
        # GH8645 check concat works with tuples, list, generators, and weird
        # stuff like deque and custom iterables
        df1 = DataFrame([1, 2, 3])
        df2 = DataFrame([4, 5, 6])
        expected = DataFrame([1, 2, 3, 4, 5, 6])
        assert_frame_equal(concat((df1, df2), ignore_index=True), expected)
        assert_frame_equal(concat([df1, df2], ignore_index=True), expected)
        assert_frame_equal(concat((df for df in (df1, df2)),
                                  ignore_index=True), expected)
        assert_frame_equal(
            concat(deque((df1, df2)), ignore_index=True), expected)
        class CustomIterator1(object):
            # sequence protocol only: __len__ + __getitem__
            def __len__(self):
                return 2
            def __getitem__(self, index):
                try:
                    return {0: df1, 1: df2}[index]
                except KeyError:
                    raise IndexError
        assert_frame_equal(pd.concat(CustomIterator1(),
                                     ignore_index=True), expected)
        class CustomIterator2(Iterable):
            def __iter__(self):
                yield df1
                yield df2
        assert_frame_equal(pd.concat(CustomIterator2(),
                                     ignore_index=True), expected)
def test_concat_invalid(self):
# trying to concat a ndframe with a non-ndframe
df1 = mkdf(10, 2)
for obj in [1, dict(), [1, 2], (1, 2)]:
pytest.raises(TypeError, lambda x: concat([df1, obj]))
    def test_concat_invalid_first_argument(self):
        """The first argument must be an iterable of NDFrames, not a bare
        NDFrame; generators and a chunked TextFileReader are accepted."""
        df1 = mkdf(10, 2)
        df2 = mkdf(10, 2)
        pytest.raises(TypeError, concat, df1, df2)
        # generator ok though
        concat(DataFrame(np.random.rand(5, 5)) for _ in range(3))
        # text reader ok
        # GH6583
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
        reader = read_csv(StringIO(data), chunksize=1)
        result = concat(reader, ignore_index=True)
        expected = read_csv(StringIO(data))
        assert_frame_equal(result, expected)
    def test_concat_NaT_series(self):
        """NaT series concatenated with datetime series keep the
        datetime64 dtype, with and without a timezone."""
        # GH 11693
        # test for merging NaT series with datetime series.
        x = Series(date_range('20151124 08:00', '20151124 09:00',
                              freq='1h', tz='US/Eastern'))
        y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
        expected = Series([x[0], x[1], pd.NaT, pd.NaT])
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        # all NaT with tz
        expected = Series(pd.NaT, index=range(4),
                          dtype='datetime64[ns, US/Eastern]')
        result = pd.concat([y, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        # without tz
        x = pd.Series(pd.date_range('20151124 08:00',
                                    '20151124 09:00', freq='1h'))
        y = pd.Series(pd.date_range('20151124 10:00',
                                    '20151124 11:00', freq='1h'))
        y[:] = pd.NaT
        expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT])
        result = pd.concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        # all NaT without tz
        x[:] = pd.NaT
        expected = pd.Series(pd.NaT, index=range(4),
                             dtype='datetime64[ns]')
        result = pd.concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
    def test_concat_tz_frame(self):
        """Splitting a frame with two differently-tz'd columns and
        re-joining on axis=1 reproduces the original frame."""
        df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'),
                             B=pd.Timestamp('20130603', tz='CET')),
                        index=range(5))
        # concat
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        assert_frame_equal(df2, df3)
    def test_concat_tz_series(self):
        """tz-aware mixed with tz-naive or object falls back to object
        dtype; same-tz pieces keep the tz-aware datetime64 dtype."""
        # gh-11755: tz and no tz
        x = Series(date_range('20151124 08:00',
                              '20151124 09:00',
                              freq='1h', tz='UTC'))
        y = Series(date_range('2012-01-01', '2012-01-02'))
        expected = Series([x[0], x[1], y[0], y[1]],
                          dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        # gh-11887: concat tz and object
        x = Series(date_range('20151124 08:00',
                              '20151124 09:00',
                              freq='1h', tz='UTC'))
        y = Series(['a', 'b'])
        expected = Series([x[0], x[1], y[0], y[1]],
                          dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        # see gh-12217 and gh-12306
        # Concatenating two UTC times
        first = pd.DataFrame([[datetime(2016, 1, 1)]])
        first[0] = first[0].dt.tz_localize('UTC')
        second = pd.DataFrame([[datetime(2016, 1, 2)]])
        second[0] = second[0].dt.tz_localize('UTC')
        result = pd.concat([first, second])
        assert result[0].dtype == 'datetime64[ns, UTC]'
        # Concatenating two London times
        first = pd.DataFrame([[datetime(2016, 1, 1)]])
        first[0] = first[0].dt.tz_localize('Europe/London')
        second = pd.DataFrame([[datetime(2016, 1, 2)]])
        second[0] = second[0].dt.tz_localize('Europe/London')
        result = pd.concat([first, second])
        assert result[0].dtype == 'datetime64[ns, Europe/London]'
        # Concatenating 2+1 London times
        first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]])
        first[0] = first[0].dt.tz_localize('Europe/London')
        second = pd.DataFrame([[datetime(2016, 1, 3)]])
        second[0] = second[0].dt.tz_localize('Europe/London')
        result = pd.concat([first, second])
        assert result[0].dtype == 'datetime64[ns, Europe/London]'
        # Concat'ing 1+2 London times
        first = pd.DataFrame([[datetime(2016, 1, 1)]])
        first[0] = first[0].dt.tz_localize('Europe/London')
        second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]])
        second[0] = second[0].dt.tz_localize('Europe/London')
        result = pd.concat([first, second])
        assert result[0].dtype == 'datetime64[ns, Europe/London]'
    def test_concat_tz_series_with_datetimelike(self):
        """tz-aware timestamps mixed with timedeltas or periods coerce to
        object dtype."""
        # see gh-12620: tz and timedelta
        x = [pd.Timestamp('2011-01-01', tz='US/Eastern'),
             pd.Timestamp('2011-02-01', tz='US/Eastern')]
        y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')]
        result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
        tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
        # tz and period
        y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')]
        result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
        tm.assert_series_equal(result, pd.Series(x + y, dtype='object'))
    def test_concat_tz_series_tzlocal(self):
        """Series localized with dateutil's tzlocal keep their tz-aware
        dtype across concat."""
        # see gh-13583
        x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()),
             pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())]
        y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()),
             pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())]
        result = concat([pd.Series(x), pd.Series(y)], ignore_index=True)
        tm.assert_series_equal(result, pd.Series(x + y))
        assert result.dtype == 'datetime64[ns, tzlocal()]'
    @pytest.mark.parametrize('tz1', [None, 'UTC'])
    @pytest.mark.parametrize('tz2', [None, 'UTC'])
    @pytest.mark.parametrize('s', [pd.NaT, pd.Timestamp('20150101')])
    def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, s):
        """axis=0 concat of an all-NaT frame with another: result takes the
        second frame's tz, or object dtype when the tzs differ."""
        # GH 12396
        # tz-naive
        first = pd.DataFrame([[pd.NaT], [pd.NaT]]).apply(
            lambda x: x.dt.tz_localize(tz1))
        second = pd.DataFrame([s]).apply(lambda x: x.dt.tz_localize(tz2))
        result = pd.concat([first, second], axis=0)
        expected = pd.DataFrame(pd.Series(
            [pd.NaT, pd.NaT, s], index=[0, 1, 0]))
        expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
        if tz1 != tz2:
            # mixed timezones coerce to object
            expected = expected.astype(object)
        assert_frame_equal(result, expected)
    @pytest.mark.parametrize('tz1', [None, 'UTC'])
    @pytest.mark.parametrize('tz2', [None, 'UTC'])
    def test_concat_NaT_dataframes_all_NaT_axis_1(self, tz1, tz2):
        """axis=1 concat of all-NaT frames keeps each column's own tz."""
        # GH 12396
        first = pd.DataFrame(pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1))
        second = pd.DataFrame(pd.Series(
            [pd.NaT]).dt.tz_localize(tz2), columns=[1])
        expected = pd.DataFrame(
            {0: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1),
             1: pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz2)}
        )
        result = pd.concat([first, second], axis=1)
        assert_frame_equal(result, expected)
    @pytest.mark.parametrize('tz1', [None, 'UTC'])
    @pytest.mark.parametrize('tz2', [None, 'UTC'])
    def test_concat_NaT_series_dataframe_all_NaT(self, tz1, tz2):
        """All-NaT Series + timestamp DataFrame: matching tzs stay
        datetime64, mixed tzs coerce to object."""
        # GH 12396
        # tz-naive
        first = pd.Series([pd.NaT, pd.NaT]).dt.tz_localize(tz1)
        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz2)],
                               [pd.Timestamp('2016/01/01', tz=tz2)]],
                              index=[2, 3])
        expected = pd.DataFrame([pd.NaT, pd.NaT,
                                 pd.Timestamp('2015/01/01', tz=tz2),
                                 pd.Timestamp('2016/01/01', tz=tz2)])
        if tz1 != tz2:
            # mixed timezones coerce to object
            expected = expected.astype(object)
        result = pd.concat([first, second])
        assert_frame_equal(result, expected)
    @pytest.mark.parametrize('tz', [None, 'UTC'])
    def test_concat_NaT_dataframes(self, tz):
        """An all-NaT frame concatenated with same-tz timestamps keeps the
        datetime64 dtype."""
        # GH 12396
        first = pd.DataFrame([[pd.NaT], [pd.NaT]])
        first = first.apply(lambda x: x.dt.tz_localize(tz))
        second = pd.DataFrame([[pd.Timestamp('2015/01/01', tz=tz)],
                               [pd.Timestamp('2016/01/01', tz=tz)]],
                              index=[2, 3])
        expected = pd.DataFrame([pd.NaT, pd.NaT,
                                 pd.Timestamp('2015/01/01', tz=tz),
                                 pd.Timestamp('2016/01/01', tz=tz)])
        result = pd.concat([first, second], axis=0)
        assert_frame_equal(result, expected)
    def test_concat_period_series(self):
        """Concatenating period series — same freq, mixed freqs, with
        datetimes, or with strings — yields object dtype in every case."""
        x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
        y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D'))
        expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'object'
        # different freq
        x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
        y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M'))
        expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'object'
        x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
        y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M'))
        expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'object'
        # non-period
        x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
        y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01']))
        expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'object'
        x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D'))
        y = Series(['A', 'B'])
        expected = Series([x[0], x[1], y[0], y[1]], dtype='object')
        result = concat([x, y], ignore_index=True)
        tm.assert_series_equal(result, expected)
        assert result.dtype == 'object'
    def test_concat_empty_series(self):
        """Empty series: axis=1 contributes an all-NaN column; axis=0
        resets conflicting names; an unnamed empty gets column 0."""
        # GH 11082
        s1 = pd.Series([1, 2, 3], name='x')
        s2 = pd.Series(name='y')
        res = pd.concat([s1, s2], axis=1)
        exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]})
        tm.assert_frame_equal(res, exp)
        s1 = pd.Series([1, 2, 3], name='x')
        s2 = pd.Series(name='y')
        res = pd.concat([s1, s2], axis=0)
        # name will be reset
        exp = pd.Series([1, 2, 3])
        tm.assert_series_equal(res, exp)
        # empty Series with no name
        s1 = pd.Series([1, 2, 3], name='x')
        s2 = pd.Series(name=None)
        res = pd.concat([s1, s2], axis=1)
        exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]},
                           columns=['x', 0])
        tm.assert_frame_equal(res, exp)
    @pytest.mark.parametrize('tz', [None, 'UTC'])
    @pytest.mark.parametrize('values', [[], [1, 2, 3]])
    def test_concat_empty_series_timelike(self, tz, values):
        """An empty datetime64 series on axis=1 becomes an all-NaT column
        of matching length and tz."""
        # GH 18447
        first = Series([], dtype='M8[ns]').dt.tz_localize(tz)
        second = Series(values)
        expected = DataFrame(
            {0: pd.Series([pd.NaT] * len(values),
                          dtype='M8[ns]'
                          ).dt.tz_localize(tz),
             1: values})
        result = concat([first, second], axis=1)
        assert_frame_equal(result, expected)
    def test_default_index(self):
        """Results that should carry a default RangeIndex do: ignore_index
        on series/frames, and unnamed series column labels."""
        # is_series and ignore_index
        s1 = pd.Series([1, 2, 3], name='x')
        s2 = pd.Series([4, 5, 6], name='y')
        res = pd.concat([s1, s2], axis=1, ignore_index=True)
        assert isinstance(res.columns, pd.RangeIndex)
        exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
        # use check_index_type=True to check the result have
        # RangeIndex (default index)
        tm.assert_frame_equal(res, exp, check_index_type=True,
                              check_column_type=True)
        # is_series and all inputs have no names
        s1 = pd.Series([1, 2, 3])
        s2 = pd.Series([4, 5, 6])
        res = pd.concat([s1, s2], axis=1, ignore_index=False)
        assert isinstance(res.columns, pd.RangeIndex)
        exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]])
        exp.columns = pd.RangeIndex(2)
        tm.assert_frame_equal(res, exp, check_index_type=True,
                              check_column_type=True)
        # is_dataframe and ignore_index
        df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]})
        df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]})
        res = pd.concat([df1, df2], axis=0, ignore_index=True)
        exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]],
                           columns=['A', 'B'])
        tm.assert_frame_equal(res, exp, check_index_type=True,
                              check_column_type=True)
        res = pd.concat([df1, df2], axis=1, ignore_index=True)
        exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]])
        tm.assert_frame_equal(res, exp, check_index_type=True,
                              check_column_type=True)
def test_concat_multiindex_rangeindex(self):
# GH13542
# when multi-index levels are RangeIndex objects
# there is a bug in concat with objects of len 1
df = DataFrame(np.random.randn(9, 2))
df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)],
labels=[np.repeat(np.arange(3), 3),
np.tile(np.arange(3), 3)])
res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]])
exp = df.iloc[[2, 3, 4, 5], :]
tm.assert_frame_equal(res, exp)
def test_concat_multiindex_dfs_with_deepcopy(self):
# GH 9967
from copy import deepcopy
example_multiindex1 = pd.MultiIndex.from_product([['a'], ['b']])
example_dataframe1 = pd.DataFrame([0], index=example_multiindex1)
example_multiindex2 = pd.MultiIndex.from_product([['a'], ['c']])
example_dataframe2 = pd.DataFrame([1], index=example_multiindex2)
example_dict = {'s1': example_dataframe1, 's2': example_dataframe2}
expected_index = pd.MultiIndex(levels=[['s1', 's2'],
['a'],
['b', 'c']],
labels=[[0, 1], [0, 0], [0, 1]],
names=['testname', None, None])
expected = pd.DataFrame([[0], [1]], index=expected_index)
result_copy = pd.concat(deepcopy(example_dict), names=['testname'])
tm.assert_frame_equal(result_copy, expected)
result_no_copy = pd.concat(example_dict, names=['testname'])
tm.assert_frame_equal(result_no_copy, expected)
def test_categorical_concat_append(self):
cat = Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical(["a", "b", "a", "b"], categories=["a", "b"])
vals2 = [1, 2, 1, 2]
exp = DataFrame({"cats": cat2, "vals": vals2},
index=Index([0, 1, 0, 1]))
tm.assert_frame_equal(pd.concat([df, df]), exp)
tm.assert_frame_equal(df.append(df), exp)
# GH 13524 can concat different categories
cat3 = Categorical(["a", "b"], categories=["a", "b", "c"])
vals3 = [1, 2]
df_different_categories = DataFrame({"cats": cat3, "vals": vals3})
res = pd.concat([df, df_different_categories], ignore_index=True)
exp = DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]})
tm.assert_frame_equal(res, exp)
res = df.append(df_different_categories, ignore_index=True)
tm.assert_frame_equal(res, exp)
def test_categorical_concat_dtypes(self):
# GH8143
index = ['cat', 'obj', 'num']
cat = Categorical(['a', 'b', 'c'])
obj = Series(['a', 'b', 'c'])
num = Series([1, 2, 3])
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
result = df.dtypes == 'object'
expected = Series([False, True, False], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == 'int64'
expected = Series([False, False, True], index=index)
tm.assert_series_equal(result, expected)
result = df.dtypes == 'category'
expected = Series([True, False, False], index=index)
tm.assert_series_equal(result, expected)
def test_categorical_concat(self, sort):
# See GH 10177
df1 = DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
columns=["a", "b", "c"])
df2 = DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
columns=["a", "c"])
cat_values = ["one", "one", "two", "one", "two", "two", "one"]
df2['h'] = Series(Categorical(cat_values))
res = pd.concat((df1, df2), axis=0, ignore_index=True, sort=sort)
exp = DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan,
np.nan, np.nan, np.nan, np.nan],
'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13],
'h': [None] * 6 + cat_values})
tm.assert_frame_equal(res, exp)
def test_categorical_concat_gh7864(self):
# GH 7864
# make sure ordering is preserverd
df = DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": list('abbaae')})
df["grade"] = Categorical(df["raw_grade"])
df['grade'].cat.set_categories(['e', 'a', 'b'])
df1 = df[0:3]
df2 = df[3:]
tm.assert_index_equal(df['grade'].cat.categories,
df1['grade'].cat.categories)
tm.assert_index_equal(df['grade'].cat.categories,
df2['grade'].cat.categories)
dfx = pd.concat([df1, df2])
tm.assert_index_equal(df['grade'].cat.categories,
dfx['grade'].cat.categories)
dfa = df1.append(df2)
tm.assert_index_equal(df['grade'].cat.categories,
dfa['grade'].cat.categories)
def test_categorical_concat_preserve(self):
# GH 8641 series concat not preserving category dtype
# GH 13524 can concat different categories
s = Series(list('abc'), dtype='category')
s2 = Series(list('abd'), dtype='category')
exp = Series(list('abcabd'))
res = pd.concat([s, s2], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list('abcabc'), dtype='category')
res = pd.concat([s, s], ignore_index=True)
tm.assert_series_equal(res, exp)
exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2],
dtype='category')
res = pd.concat([s, s])
tm.assert_series_equal(res, exp)
a = Series(np.arange(6, dtype='int64'))
b = Series(list('aabbca'))
df2 = DataFrame({'A': a,
'B': b.astype(CategoricalDtype(list('cab')))})
res = pd.concat([df2, df2])
exp = DataFrame(
{'A': pd.concat([a, a]),
'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))})
tm.assert_frame_equal(res, exp)
def test_categorical_index_preserver(self):
a = Series(np.arange(6, dtype='int64'))
b = Series(list('aabbca'))
df2 = DataFrame({'A': a,
'B': b.astype(CategoricalDtype(list('cab')))
}).set_index('B')
result = pd.concat([df2, df2])
expected = DataFrame(
{'A': pd.concat([a, a]),
'B': pd.concat([b, b]).astype(CategoricalDtype(list('cab')))
}).set_index('B')
tm.assert_frame_equal(result, expected)
# wrong catgories
df3 = DataFrame({'A': a, 'B': Categorical(b, categories=list('abe'))
}).set_index('B')
pytest.raises(TypeError, lambda: pd.concat([df2, df3]))
def test_concat_categoricalindex(self):
# GH 16111, categories that aren't lexsorted
categories = [9, 0, 1, 2, 3]
a = pd.Series(1, index=pd.CategoricalIndex([9, 0],
categories=categories))
b = pd.Series(2, index=pd.CategoricalIndex([0, 1],
categories=categories))
c = pd.Series(3, index=pd.CategoricalIndex([1, 2],
categories=categories))
result = pd.concat([a, b, c], axis=1)
exp_idx = pd.CategoricalIndex([0, 1, 2, 9])
exp = pd.DataFrame({0: [1, np.nan, np.nan, 1],
1: [2, 2, np.nan, np.nan],
2: [np.nan, 3, 3, np.nan]},
columns=[0, 1, 2],
index=exp_idx)
tm.assert_frame_equal(result, exp)
def test_concat_order(self):
# GH 17344
dfs = [pd.DataFrame(index=range(3), columns=['a', 1, None])]
dfs += [pd.DataFrame(index=range(3), columns=[None, 1, 'a'])
for i in range(100)]
result = pd.concat(dfs, sort=True).columns
if PY2:
# Different sort order between incomparable objects between
# python 2 and python3 via Index.union.
expected = dfs[1].columns
else:
expected = dfs[0].columns
tm.assert_index_equal(result, expected)
    def test_concat_datetime_timezone(self):
        # GH 18523
        # axis=1 concat of frames whose indexes hold the same instants in
        # the same timezone: the joined index keeps that timezone.
        idx1 = pd.date_range('2011-01-01', periods=3, freq='H',
                             tz='Europe/Paris')
        # idx2 is built from idx1's endpoints, so it inherits the tz and
        # covers exactly the same hourly instants
        idx2 = pd.date_range(start=idx1[0], end=idx1[-1], freq='H')
        df1 = pd.DataFrame({'a': [1, 2, 3]}, index=idx1)
        df2 = pd.DataFrame({'b': [1, 2, 3]}, index=idx2)
        result = pd.concat([df1, df2], axis=1)
        # expected index: the same wall-clock hours, spelled with their
        # +01:00 offsets, localized to UTC and converted back to Paris time
        exp_idx = DatetimeIndex(['2011-01-01 00:00:00+01:00',
                                 '2011-01-01 01:00:00+01:00',
                                 '2011-01-01 02:00:00+01:00'],
                                freq='H'
                                ).tz_localize('UTC').tz_convert('Europe/Paris')
        expected = pd.DataFrame([[1, 1], [2, 2], [3, 3]],
                                index=exp_idx, columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)
        # two *different* timezones: the index union is computed in UTC and
        # the instants are disjoint, so values interleave with NaN
        idx3 = pd.date_range('2011-01-01', periods=3,
                             freq='H', tz='Asia/Tokyo')
        df3 = pd.DataFrame({'b': [1, 2, 3]}, index=idx3)
        result = pd.concat([df1, df3], axis=1)
        # Tokyo 00-02h maps to Dec 31 15-17h UTC, Paris 00-02h to
        # Dec 31 23h - Jan 1 01h UTC
        exp_idx = DatetimeIndex(['2010-12-31 15:00:00+00:00',
                                 '2010-12-31 16:00:00+00:00',
                                 '2010-12-31 17:00:00+00:00',
                                 '2010-12-31 23:00:00+00:00',
                                 '2011-01-01 00:00:00+00:00',
                                 '2011-01-01 01:00:00+00:00']
                                ).tz_localize('UTC')
        expected = pd.DataFrame([[np.nan, 1], [np.nan, 2], [np.nan, 3],
                                 [1, np.nan], [2, np.nan], [3, np.nan]],
                                index=exp_idx, columns=['a', 'b'])
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
@pytest.mark.parametrize('dt', np.sctypes['float'])
def test_concat_no_unnecessary_upcast(dt, pdt):
    # GH 13247: concatenating same-width floats must not upcast
    with catch_warnings(record=True):
        dims = pdt().ndim
        pieces = [pdt(np.array([1], dtype=dt, ndmin=dims)),
                  pdt(np.array([np.nan], dtype=dt, ndmin=dims)),
                  pdt(np.array([5], dtype=dt, ndmin=dims))]
        result = pd.concat(pieces)
        assert result.values.dtype == dt
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
@pytest.mark.parametrize('dt', np.sctypes['int'])
def test_concat_will_upcast(dt, pdt):
    # mixing any int dtype with a float NaN forces an upcast to float64
    with catch_warnings(record=True):
        dims = pdt().ndim
        pieces = [pdt(np.array([1], dtype=dt, ndmin=dims)),
                  pdt(np.array([np.nan], ndmin=dims)),
                  pdt(np.array([5], dtype=dt, ndmin=dims))]
        result = pd.concat(pieces)
        assert result.values.dtype == 'float64'
def test_concat_empty_and_non_empty_frame_regression():
    # GH 18178 regression test: the empty frame only influences the dtype
    non_empty = pd.DataFrame({'foo': [1]})
    empty = pd.DataFrame({'foo': []})
    expected = pd.DataFrame({'foo': [1.0]})
    result = pd.concat([non_empty, empty])
    assert_frame_equal(result, expected)
def test_concat_empty_and_non_empty_series_regression():
    # GH 18187 regression test: an empty Series is effectively a no-op
    non_empty = pd.Series([1])
    empty = pd.Series([])
    expected = non_empty
    result = pd.concat([non_empty, empty])
    tm.assert_series_equal(result, expected)
def test_concat_sorts_columns(sort_with_none):
    # GH-4588: sort=True/None sorts the column union; sort=False keeps
    # the order of first appearance
    df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
    df2 = pd.DataFrame({"a": [3, 4], "c": [5, 6]})

    # expectation for sort=True/None
    expected = pd.DataFrame({"a": [1, 2, 3, 4],
                             "b": [1, 2, None, None],
                             "c": [None, None, 5, 6]},
                            columns=['a', 'b', 'c'])
    if sort_with_none is False:
        expected = expected[['b', 'a', 'c']]

    if sort_with_none is None:
        # only the unspecified default warns
        ctx = tm.assert_produces_warning(FutureWarning)
    else:
        ctx = tm.assert_produces_warning(None)

    with ctx:
        result = pd.concat([df1, df2], ignore_index=True, sort=sort_with_none)
    tm.assert_frame_equal(result, expected)
def test_concat_sorts_index(sort_with_none):
    # axis=1: sort=True/None sorts the row-index union; sort=False keeps
    # the first frame's order
    df1 = pd.DataFrame({"a": [1, 2, 3]}, index=['c', 'a', 'b'])
    df2 = pd.DataFrame({"b": [1, 2]}, index=['a', 'b'])

    # expectation for sort=True/None
    expected = pd.DataFrame({"a": [2, 3, 1], "b": [1, 2, None]},
                            index=['a', 'b', 'c'],
                            columns=['a', 'b'])
    if sort_with_none is False:
        expected = expected.loc[['c', 'a', 'b']]

    if sort_with_none is None:
        # only the unspecified default warns
        ctx = tm.assert_produces_warning(FutureWarning)
    else:
        ctx = tm.assert_produces_warning(None)

    with ctx:
        result = pd.concat([df1, df2], axis=1, sort=sort_with_none)
    tm.assert_frame_equal(result, expected)
def test_concat_inner_sort(sort_with_none):
    # https://github.com/pandas-dev/pandas/pull/20613
    # an inner join never sorted historically, so leaving sort unset
    # must not emit the FutureWarning
    df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 2]},
                       columns=['b', 'a', 'c'])
    df2 = pd.DataFrame({"a": [1, 2], 'b': [3, 4]}, index=[3, 4])

    with tm.assert_produces_warning(None):
        result = pd.concat([df1, df2], sort=sort_with_none,
                           join='inner',
                           ignore_index=True)

    expected = pd.DataFrame({"b": [1, 2, 3, 4], "a": [1, 2, 1, 2]},
                            columns=['b', 'a'])
    if sort_with_none is True:
        expected = expected[['a', 'b']]
    tm.assert_frame_equal(result, expected)
def test_concat_aligned_sort():
    # GH-4588: sort=True sorts the columns of the result
    df = pd.DataFrame({"c": [1, 2], "b": [3, 4], 'a': [5, 6]},
                      columns=['c', 'b', 'a'])
    expected = pd.DataFrame({'a': [5, 6, 5, 6], 'b': [3, 4, 3, 4],
                             'c': [1, 2, 1, 2]},
                            columns=['a', 'b', 'c'])
    result = pd.concat([df, df], sort=True, ignore_index=True)
    tm.assert_frame_equal(result, expected)

    # the same applies to the column intersection for join='inner'
    result = pd.concat([df, df[['c', 'b']]], join='inner', sort=True,
                       ignore_index=True)
    tm.assert_frame_equal(result, expected[['b', 'c']])
def test_concat_aligned_sort_does_not_raise():
    # GH-4588
    # unorderable column labels (int vs str): the TypeError from sorting
    # is swallowed internally and the original order is kept
    df = pd.DataFrame({1: [1, 2], "a": [3, 4]}, columns=[1, 'a'])
    expected = pd.DataFrame({1: [1, 2, 1, 2], 'a': [3, 4, 3, 4]},
                            columns=[1, 'a'])
    result = pd.concat([df, df], ignore_index=True, sort=True)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("s1name,s2name", [
    (np.int64(190), (43, 0)), (190, (43, 0))])
def test_concat_series_name_npscalar_tuple(s1name, s2name):
    # GH21015: differing names (numpy scalar vs tuple) -> unnamed result
    s1 = pd.Series({'a': 1, 'b': 2}, name=s1name)
    s2 = pd.Series({'c': 5, 'd': 6}, name=s2name)

    result = pd.concat([s1, s2])

    expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6})
    tm.assert_series_equal(result, expected)