# pylint: disable=E1103 from warnings import catch_warnings from numpy.random import randn import numpy as np import pytest import pandas as pd from pandas.compat import lrange import pandas.compat as compat from pandas.util.testing import assert_frame_equal from pandas import DataFrame, MultiIndex, Series, Index, merge, concat from pandas._libs import join as libjoin import pandas.util.testing as tm from pandas.tests.reshape.merge.test_merge import get_test_data, N, NGROUPS a_ = np.array class TestJoin(object): def setup_method(self, method): # aggregate multiple columns self.df = DataFrame({'key1': get_test_data(), 'key2': get_test_data(), 'data1': np.random.randn(N), 'data2': np.random.randn(N)}) # exclude a couple keys for fun self.df = self.df[self.df['key2'] > 1] self.df2 = DataFrame({'key1': get_test_data(n=N // 5), 'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5), 'value': np.random.randn(N // 5)}) index, data = tm.getMixedTypeDict() self.target = DataFrame(data, index=index) # Join on string value self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, index=data['C']) def test_cython_left_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 ls, rs = libjoin.left_outer_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 rs, ls = libjoin.left_outer_join(right, left, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') # 0 1 1 1 exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, # 2 2 4 6, 7, 8, 6, 7, 8, -1]) exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 ls, rs = libjoin.inner_join(left, right, max_group) exp_ls = left.argsort(kind='mergesort') exp_rs = right.argsort(kind='mergesort') exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8]) exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5]) exp_ls = exp_ls.take(exp_li) exp_ls[exp_li == -1] = -1 exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') joined_both = merge(self.df, self.df2) _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='left') def test_right_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='right') _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') joined_both = merge(self.df, self.df2, how='right') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='right') def test_full_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='outer') _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') joined_both = merge(self.df, self.df2, how='outer') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='outer') def test_inner_join(self): joined_key2 = merge(self.df, self.df2, on='key2', how='inner') _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') joined_both = merge(self.df, self.df2, how='inner') _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], how='inner') def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key1.bar' in joined def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) assert 'key1.foo' in joined assert 'key2.bar' in joined def test_join_on(self): target = self.target source = self.source merged = target.join(source, on='C') tm.assert_series_equal(merged['MergedA'], target['A'], check_names=False) tm.assert_series_equal(merged['MergedD'], target['D'], check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) joined = df.join(df2, on='key') expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], 'value': [0, 0, 1, 1, 2]}) assert_frame_equal(joined, expected) # Test when some are missing df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], columns=['one']) df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two']) df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three']) joined = df_a.join(df_b, on='one') joined = joined.join(df_c, on='one') assert np.isnan(joined['two']['c']) assert np.isnan(joined['three']['c']) # merge column not p resent pytest.raises(KeyError, target.join, source, on='E') # overlap source_copy = source.copy() source_copy['A'] = 0 pytest.raises(ValueError, target.join, source_copy, on='A') def test_join_on_fails_with_different_right_index(self): with pytest.raises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) merge(df, df2, left_on='a', right_index=True) def test_join_on_fails_with_different_left_index(self): with pytest.raises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}, index=tm.makeCustomIndex(10, 2)) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}) merge(df, df2, right_on='b', left_index=True) def test_join_on_fails_with_different_column_counts(self): with pytest.raises(ValueError): df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), 'b': np.random.randn(3)}) df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), 'b': np.random.randn(10)}, index=tm.makeCustomIndex(10, 2)) merge(df, df2, right_on='a', left_on=['a', 'b']) def test_join_on_fails_with_wrong_object_type(self): # GH12081 wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] df = DataFrame({'a': [1, 1]}) for obj in wrongly_typed: with tm.assert_raises_regex(ValueError, str(type(obj))): merge(obj, df, left_on='a', right_on='a') with tm.assert_raises_regex(ValueError, str(type(obj))): merge(df, obj, left_on='a', right_on='a') def test_join_on_pass_vector(self): expected = self.target.join(self.source, on='C') del expected['C'] join_col = self.target.pop('C') result = self.target.join(self.source, on=join_col) assert_frame_equal(result, expected) def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: assert col in merged assert merged[col].isna().all() merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') tm.assert_index_equal(merged2.columns, merged.columns) assert len(merged2) == 0 def test_join_on_inner(self): df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) joined = df.join(df2, on='key', how='inner') expected = df.join(df2, on='key') expected = expected[expected['value'].notna()] tm.assert_series_equal(joined['key'], expected['key'], check_dtype=False) tm.assert_series_equal(joined['value'], expected['value'], check_dtype=False) tm.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) # corner cases joined = df.join(df2, on=['key']) expected = df.join(df2, on='key') assert_frame_equal(joined, expected) def test_join_on_series(self): result = self.target.join(self.source['MergedA'], on='C') expected = self.target.join(self.source[['MergedA']], on='C') assert_frame_equal(result, expected) def test_join_on_series_buglet(self): # GH #638 df = DataFrame({'a': [1, 1]}) ds = Series([2], index=[1], name='b') result = df.join(ds, on='a') expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index) tm.assert_frame_equal(result, expected) def test_join_index_mixed(self, join_type): # no overlapping blocks df1 = DataFrame(index=np.arange(10)) df1['bool'] = True df1['string'] = 'foo' df2 = DataFrame(index=np.arange(5, 15)) df2['int'] = 1 df2['float'] = 1. joined = df1.join(df2, how=join_type) expected = _join_by_hand(df1, df2, how=join_type) assert_frame_equal(joined, expected) joined = df2.join(df1, how=join_type) expected = _join_by_hand(df2, df1, how=join_type) assert_frame_equal(joined, expected) def test_join_index_mixed_overlap(self): df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) assert df1['B'].dtype == np.int64 assert df1['D'].dtype == np.bool_ df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(0, 10, 2), columns=['A', 'B', 'C', 'D']) # overlap joined = df1.join(df2, lsuffix='_one', rsuffix='_two') expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', 'A_two', 'B_two', 'C_two', 'D_two'] df1.columns = expected_columns[:4] df2.columns = expected_columns[4:] expected = _join_by_hand(df1, df2) assert_frame_equal(joined, expected) def test_join_empty_bug(self): # generated an exception in 0.4.3 x = DataFrame() x.join(DataFrame([3], index=[0], columns=['A']), how='outer') def test_join_unconsolidated(self): # GH #331 a = DataFrame(randn(30, 2), columns=['a', 'b']) c = Series(randn(30)) a['c'] = c d = DataFrame(randn(30, 1), columns=['q']) # it works! a.join(d) d.join(a) def test_join_multiindex(self): index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]], names=['first', 'second']) df1 = DataFrame(data=np.random.randn(6), index=index1, columns=['var X']) df2 = DataFrame(data=np.random.randn(6), index=index2, columns=['var Y']) df1 = df1.sort_index(level=0) df2 = df2.sort_index(level=0) joined = df1.join(df2, how='outer') ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names df1 = df1.sort_index(level=1) df2 = df2.sort_index(level=1) joined = df1.join(df2, how='outer').sort_index(level=0) ex_index = Index(index1.values).union(Index(index2.values)) expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) expected.index.names = index1.names assert_frame_equal(joined, expected) assert joined.index.names == index1.names def test_join_inner_multiindex(self): key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux', 'snap'] key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three', 'one'] data = np.random.randn(len(key1)) data = DataFrame({'key1': key1, 'key2': key2, 'data': data}) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) to_join = DataFrame(np.random.randn(10, 3), index=index, columns=['j_one', 'j_two', 'j_three']) joined = data.join(to_join, on=['key1', 'key2'], how='inner') expected = merge(data, to_join.reset_index(), left_on=['key1', 'key2'], right_on=['first', 'second'], how='inner', sort=False) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) assert_frame_equal(joined, expected2.reindex_like(joined)) expected2 = merge(to_join, data, right_on=['key1', 'key2'], left_index=True, how='inner', sort=False) expected = expected.drop(['first', 'second'], axis=1) expected.index = joined.index assert joined.index.is_monotonic assert_frame_equal(joined, expected) # _assert_same_contents(expected, expected2.loc[:, expected.columns]) def test_join_hierarchical_mixed(self): # GH 2024 df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) other_df = DataFrame( [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) other_df.set_index('a', inplace=True) # GH 9455, 12219 with tm.assert_produces_warning(UserWarning): result = merge(new_df, other_df, left_index=True, right_index=True) assert ('b', 'mean') in result assert 'b' in result def test_join_float64_float32(self): a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) joined = a.join(b) assert joined.dtypes['a'] == 'float64' assert joined.dtypes['b'] == 'float64' assert joined.dtypes['c'] == 'float32' a = np.random.randint(0, 5, 100).astype('int64') b = np.random.random(100).astype('float64') c = np.random.random(100).astype('float32') df = DataFrame({'a': a, 'b': b, 'c': c}) xpdf = DataFrame({'a': a, 'b': b, 'c': c}) s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) rs = df.merge(s, left_on='a', right_index=True) assert rs.dtypes['a'] == 'int64' assert rs.dtypes['b'] == 'float64' assert rs.dtypes['c'] == 'float32' assert rs.dtypes['md'] == 'float32' xp = xpdf.merge(s, left_on='a', right_index=True) assert_frame_equal(rs, xp) def test_join_many_non_unique_index(self): df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='outer') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() expected = expected[result.columns] expected['a'] = expected.a.astype('int64') expected['b'] = expected.b.astype('int64') assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) df3 = DataFrame( {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) idf1 = df1.set_index(["a", "b"]) idf2 = df2.set_index(["a", "b"]) idf3 = df3.set_index(["a", "b"]) result = idf1.join([idf2, idf3], how='inner') df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') result = result.reset_index() assert_frame_equal(result, expected.loc[:, result.columns]) # GH 11519 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8)}) s = Series(np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name='TEST') inner = df.join(s, how='inner') outer = df.join(s, how='outer') left = df.join(s, how='left') right = df.join(s, how='right') assert_frame_equal(inner, outer) assert_frame_equal(inner, left) assert_frame_equal(inner, right) def test_join_sort(self): left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4]}) right = DataFrame({'value2': ['a', 'b', 'c']}, index=['bar', 'baz', 'foo']) joined = left.join(right, on='key', sort=True) expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], 'value': [2, 3, 1, 4], 'value2': ['a', 'b', 'c', 'c']}, index=[1, 2, 0, 3]) assert_frame_equal(joined, expected) # smoke test joined = left.join(right, on='key', sort=False) tm.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) result = df1.join(df2) expected = DataFrame({'a': [1, 2, 3, 3, 4], 'b': [5, np.nan, 6, 7, np.nan]}, index=[1, 2, 3, 3, 'a']) tm.assert_frame_equal(result, expected) df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) result = df3.join(df4) expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, index=[1, 2, 2, 'a']) tm.assert_frame_equal(result, expected) def test_join_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M') df = DataFrame([i for i in range(len(index))], index=index, columns=['pnum']) df2 = concat([df, df]) result = df.join(df2, how='inner', rsuffix='_df2') expected = DataFrame( np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2), columns=['pnum', 'pnum_df2'], index=df2.sort_index().index) tm.assert_frame_equal(result, expected) def test_mixed_type_join_with_suffix(self): # GH #916 df = DataFrame(np.random.randn(20, 6), columns=['a', 'b', 'c', 'd', 'e', 'f']) df.insert(0, 'id', 0) df.insert(5, 'dt', 'foo') grouped = df.groupby('id') mn = grouped.mean() cn = grouped.count() # it works! mn.join(cn, rsuffix='_right') def test_join_many(self): df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] joined = df_list[0].join(df_list[1:]) tm.assert_frame_equal(joined, df) df_list = [df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]] def _check_diff_index(df_list, result, exp_index): reindexed = [x.reindex(exp_index) for x in df_list] expected = reindexed[0].join(reindexed[1:]) tm.assert_frame_equal(result, expected) # different join types joined = df_list[0].join(df_list[1:], how='outer') _check_diff_index(df_list, joined, df.index) joined = df_list[0].join(df_list[1:]) _check_diff_index(df_list, joined, df_list[0].index) joined = df_list[0].join(df_list[1:], how='inner') _check_diff_index(df_list, joined, df.index[2:8]) pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a') def test_join_many_mixed(self): df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) df['key'] = ['foo', 'bar'] * 4 df1 = df.loc[:, ['A', 'B']] df2 = df.loc[:, ['C', 'D']] df3 = df.loc[:, ['key']] result = df1.join([df2, df3]) assert_frame_equal(result, df) def test_join_dups(self): # joining dups df = concat([DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']), DataFrame(np.random.randint(0, 10, size=20) .reshape(10, 2), columns=['A', 'C'])], axis=1) expected = concat([df, df], axis=1) result = df.join(df, rsuffix='_2') result.columns = expected.columns assert_frame_equal(result, expected) # GH 4975, invalid join on dups w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) dta = x.merge(y, left_index=True, right_index=True).merge( z, left_index=True, right_index=True, how="outer") dta = dta.merge(w, left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = ['x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] assert_frame_equal(dta, expected) def test_panel_join(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.iloc[:2, :10, :3] p2 = panel.iloc[2:, 5:, 2:] # left join result = p1.join(p2) expected = p1.copy() expected['ItemC'] = p2['ItemC'] tm.assert_panel_equal(result, expected) # right join result = p1.join(p2, how='right') expected = p2.copy() expected['ItemA'] = p1['ItemA'] expected['ItemB'] = p1['ItemB'] expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) tm.assert_panel_equal(result, expected) # inner join result = p1.join(p2, how='inner') expected = panel.iloc[:, 5:10, 2:3] tm.assert_panel_equal(result, expected) # outer join result = p1.join(p2, how='outer') expected = p1.reindex(major=panel.major_axis, minor=panel.minor_axis) expected = expected.join(p2.reindex(major=panel.major_axis, minor=panel.minor_axis)) tm.assert_panel_equal(result, expected) def test_panel_join_overlap(self): with catch_warnings(record=True): panel = tm.makePanel() tm.add_nans(panel) p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']] p2 = panel.loc[['ItemB', 'ItemC']] # Expected index is # # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.loc[['ItemA']] expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): with catch_warnings(record=True): tm.K = 10 panel = tm.makePanel() tm.K = 4 panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]] joined = panels[0].join(panels[1:]) tm.assert_panel_equal(joined, panel) panels = [panel.iloc[:2, :-5], panel.iloc[2:6, 2:], panel.iloc[6:, 5:-7]] data_dict = {} for p in panels: data_dict.update(p.iteritems()) joined = panels[0].join(panels[1:], how='inner') expected = pd.Panel.from_dict(data_dict, intersect=True) tm.assert_panel_equal(joined, expected) joined = panels[0].join(panels[1:], how='outer') expected = pd.Panel.from_dict(data_dict, intersect=False) tm.assert_panel_equal(joined, expected) # edge cases pytest.raises(ValueError, panels[0].join, panels[1:], how='outer', lsuffix='foo', rsuffix='bar') pytest.raises(ValueError, panels[0].join, panels[1:], how='right') def _check_join(left, right, result, join_col, how='left', lsuffix='_x', rsuffix='_y'): # some smoke tests for c in join_col: assert(result[c].notna().all()) left_grouped = left.groupby(join_col) right_grouped = right.groupby(join_col) for group_key, group in result.groupby(join_col): l_joined = _restrict_to_columns(group, left.columns, lsuffix) r_joined = _restrict_to_columns(group, right.columns, rsuffix) try: lgroup = left_grouped.get_group(group_key) except KeyError: if how in ('left', 'inner'): raise AssertionError('key %s should not have been in the join' % str(group_key)) _assert_all_na(l_joined, left.columns, join_col) else: _assert_same_contents(l_joined, lgroup) try: rgroup = right_grouped.get_group(group_key) except KeyError: if how in ('right', 'inner'): raise AssertionError('key %s should not have been in the join' % str(group_key)) _assert_all_na(r_joined, right.columns, join_col) else: _assert_same_contents(r_joined, rgroup) def _restrict_to_columns(group, columns, suffix): found = [c for c in group.columns if c in columns or c.replace(suffix, '') in columns] # filter group = group.loc[:, found] # get rid of suffixes, if any group = group.rename(columns=lambda x: x.replace(suffix, '')) # put in the right order... group = group.loc[:, columns] return group def _assert_same_contents(join_chunk, source): NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values svalues = source.fillna(NA_SENTINEL).drop_duplicates().values rows = {tuple(row) for row in jvalues} assert(len(rows) == len(source)) assert(all(tuple(row) in rows for row in svalues)) def _assert_all_na(join_chunk, source_columns, join_col): for c in source_columns: if c in join_col: continue assert(join_chunk[c].isna().all()) def _join_by_hand(a, b, how='left'): join_index = a.index.join(b.index, how=how) a_re = a.reindex(join_index) b_re = b.reindex(join_index) result_columns = a.columns.append(b.columns) for col, s in compat.iteritems(b_re): a_re[col] = s return a_re.reindex(columns=result_columns)