789 lines
30 KiB
Python
789 lines
30 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
from datetime import datetime
|
||
|
|
||
|
import numpy as np
|
||
|
from numpy import nan
|
||
|
|
||
|
import pandas as pd
|
||
|
|
||
|
from pandas import DataFrame, Index, Series, Timestamp, date_range
|
||
|
from pandas.compat import lrange
|
||
|
|
||
|
from pandas.tests.frame.common import TestData
|
||
|
|
||
|
import pandas.util.testing as tm
|
||
|
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||
|
|
||
|
|
||
|
class TestDataFrameConcatCommon(TestData):
|
||
|
|
||
|
def test_concat_multiple_frames_dtypes(self):
|
||
|
|
||
|
# GH 2759
|
||
|
A = DataFrame(data=np.ones((10, 2)), columns=[
|
||
|
'foo', 'bar'], dtype=np.float64)
|
||
|
B = DataFrame(data=np.ones((10, 2)), dtype=np.float32)
|
||
|
results = pd.concat((A, B), axis=1).get_dtype_counts()
|
||
|
expected = Series(dict(float64=2, float32=2))
|
||
|
assert_series_equal(results, expected)
|
||
|
|
||
|
def test_concat_multiple_tzs(self):
|
||
|
# GH 12467
|
||
|
# combining datetime tz-aware and naive DataFrames
|
||
|
ts1 = Timestamp('2015-01-01', tz=None)
|
||
|
ts2 = Timestamp('2015-01-01', tz='UTC')
|
||
|
ts3 = Timestamp('2015-01-01', tz='EST')
|
||
|
|
||
|
df1 = DataFrame(dict(time=[ts1]))
|
||
|
df2 = DataFrame(dict(time=[ts2]))
|
||
|
df3 = DataFrame(dict(time=[ts3]))
|
||
|
|
||
|
results = pd.concat([df1, df2]).reset_index(drop=True)
|
||
|
expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
|
||
|
assert_frame_equal(results, expected)
|
||
|
|
||
|
results = pd.concat([df1, df3]).reset_index(drop=True)
|
||
|
expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
|
||
|
assert_frame_equal(results, expected)
|
||
|
|
||
|
results = pd.concat([df2, df3]).reset_index(drop=True)
|
||
|
expected = DataFrame(dict(time=[ts2, ts3]))
|
||
|
assert_frame_equal(results, expected)
|
||
|
|
||
|
def test_concat_tuple_keys(self):
|
||
|
# GH 14438
|
||
|
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('AB'))
|
||
|
df2 = pd.DataFrame(np.ones((3, 2)) * 2, columns=list('AB'))
|
||
|
results = pd.concat((df1, df2), keys=[('bee', 'bah'), ('bee', 'boo')])
|
||
|
expected = pd.DataFrame(
|
||
|
{'A': {('bee', 'bah', 0): 1.0,
|
||
|
('bee', 'bah', 1): 1.0,
|
||
|
('bee', 'boo', 0): 2.0,
|
||
|
('bee', 'boo', 1): 2.0,
|
||
|
('bee', 'boo', 2): 2.0},
|
||
|
'B': {('bee', 'bah', 0): 1.0,
|
||
|
('bee', 'bah', 1): 1.0,
|
||
|
('bee', 'boo', 0): 2.0,
|
||
|
('bee', 'boo', 1): 2.0,
|
||
|
('bee', 'boo', 2): 2.0}})
|
||
|
assert_frame_equal(results, expected)
|
||
|
|
||
|
def test_append_series_dict(self):
|
||
|
df = DataFrame(np.random.randn(5, 4),
|
||
|
columns=['foo', 'bar', 'baz', 'qux'])
|
||
|
|
||
|
series = df.loc[4]
|
||
|
with tm.assert_raises_regex(ValueError,
|
||
|
'Indexes have overlapping values'):
|
||
|
df.append(series, verify_integrity=True)
|
||
|
series.name = None
|
||
|
with tm.assert_raises_regex(TypeError,
|
||
|
'Can only append a Series if '
|
||
|
'ignore_index=True'):
|
||
|
df.append(series, verify_integrity=True)
|
||
|
|
||
|
result = df.append(series[::-1], ignore_index=True)
|
||
|
expected = df.append(DataFrame({0: series[::-1]}, index=df.columns).T,
|
||
|
ignore_index=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# dict
|
||
|
result = df.append(series.to_dict(), ignore_index=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.append(series[::-1][:3], ignore_index=True)
|
||
|
expected = df.append(DataFrame({0: series[::-1][:3]}).T,
|
||
|
ignore_index=True, sort=True)
|
||
|
assert_frame_equal(result, expected.loc[:, result.columns])
|
||
|
|
||
|
# can append when name set
|
||
|
row = df.loc[4]
|
||
|
row.name = 5
|
||
|
result = df.append(row)
|
||
|
expected = df.append(df[-1:], ignore_index=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_append_list_of_series_dicts(self):
|
||
|
df = DataFrame(np.random.randn(5, 4),
|
||
|
columns=['foo', 'bar', 'baz', 'qux'])
|
||
|
|
||
|
dicts = [x.to_dict() for idx, x in df.iterrows()]
|
||
|
|
||
|
result = df.append(dicts, ignore_index=True)
|
||
|
expected = df.append(df, ignore_index=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# different columns
|
||
|
dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
|
||
|
{'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
|
||
|
result = df.append(dicts, ignore_index=True, sort=True)
|
||
|
expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_append_empty_dataframe(self):
|
||
|
|
||
|
# Empty df append empty df
|
||
|
df1 = DataFrame([])
|
||
|
df2 = DataFrame([])
|
||
|
result = df1.append(df2)
|
||
|
expected = df1.copy()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# Non-empty df append empty df
|
||
|
df1 = DataFrame(np.random.randn(5, 2))
|
||
|
df2 = DataFrame()
|
||
|
result = df1.append(df2)
|
||
|
expected = df1.copy()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# Empty df with columns append empty df
|
||
|
df1 = DataFrame(columns=['bar', 'foo'])
|
||
|
df2 = DataFrame()
|
||
|
result = df1.append(df2)
|
||
|
expected = df1.copy()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# Non-Empty df with columns append empty df
|
||
|
df1 = DataFrame(np.random.randn(5, 2), columns=['bar', 'foo'])
|
||
|
df2 = DataFrame()
|
||
|
result = df1.append(df2)
|
||
|
expected = df1.copy()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_append_dtypes(self):
|
||
|
|
||
|
# GH 5754
|
||
|
# row appends of different dtypes (so need to do by-item)
|
||
|
# can sometimes infer the correct type
|
||
|
|
||
|
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(5))
|
||
|
df2 = DataFrame()
|
||
|
result = df1.append(df2)
|
||
|
expected = df1.copy()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||
|
df2 = DataFrame({'bar': 'foo'}, index=lrange(1, 2))
|
||
|
result = df1.append(df2)
|
||
|
expected = DataFrame({'bar': [Timestamp('20130101'), 'foo']})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||
|
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2))
|
||
|
result = df1.append(df2)
|
||
|
expected = DataFrame(
|
||
|
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||
|
df2 = DataFrame({'bar': np.nan}, index=lrange(1, 2), dtype=object)
|
||
|
result = df1.append(df2)
|
||
|
expected = DataFrame(
|
||
|
{'bar': Series([Timestamp('20130101'), np.nan], dtype='M8[ns]')})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df1 = DataFrame({'bar': np.nan}, index=lrange(1))
|
||
|
df2 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1, 2))
|
||
|
result = df1.append(df2)
|
||
|
expected = DataFrame(
|
||
|
{'bar': Series([np.nan, Timestamp('20130101')], dtype='M8[ns]')})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df1 = DataFrame({'bar': Timestamp('20130101')}, index=lrange(1))
|
||
|
df2 = DataFrame({'bar': 1}, index=lrange(1, 2), dtype=object)
|
||
|
result = df1.append(df2)
|
||
|
expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_update(self):
|
||
|
df = DataFrame([[1.5, nan, 3.],
|
||
|
[1.5, nan, 3.],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3]])
|
||
|
|
||
|
other = DataFrame([[3.6, 2., np.nan],
|
||
|
[np.nan, np.nan, 7]], index=[1, 3])
|
||
|
|
||
|
df.update(other)
|
||
|
|
||
|
expected = DataFrame([[1.5, nan, 3],
|
||
|
[3.6, 2, 3],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 7.]])
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_update_dtypes(self):
|
||
|
|
||
|
# gh 3016
|
||
|
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||
|
columns=['A', 'B', 'bool1', 'bool2'])
|
||
|
|
||
|
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||
|
df.update(other)
|
||
|
|
||
|
expected = DataFrame([[45., 45., False, True], [4., 5., True, False]],
|
||
|
columns=['A', 'B', 'bool1', 'bool2'])
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_update_nooverwrite(self):
|
||
|
df = DataFrame([[1.5, nan, 3.],
|
||
|
[1.5, nan, 3.],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3]])
|
||
|
|
||
|
other = DataFrame([[3.6, 2., np.nan],
|
||
|
[np.nan, np.nan, 7]], index=[1, 3])
|
||
|
|
||
|
df.update(other, overwrite=False)
|
||
|
|
||
|
expected = DataFrame([[1.5, nan, 3],
|
||
|
[1.5, 2, 3],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3.]])
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_update_filtered(self):
|
||
|
df = DataFrame([[1.5, nan, 3.],
|
||
|
[1.5, nan, 3.],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3]])
|
||
|
|
||
|
other = DataFrame([[3.6, 2., np.nan],
|
||
|
[np.nan, np.nan, 7]], index=[1, 3])
|
||
|
|
||
|
df.update(other, filter_func=lambda x: x > 2)
|
||
|
|
||
|
expected = DataFrame([[1.5, nan, 3],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 7.]])
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_update_raise(self):
|
||
|
df = DataFrame([[1.5, 1, 3.],
|
||
|
[1.5, nan, 3.],
|
||
|
[1.5, nan, 3],
|
||
|
[1.5, nan, 3]])
|
||
|
|
||
|
other = DataFrame([[2., nan],
|
||
|
[nan, 7]], index=[1, 3], columns=[1, 2])
|
||
|
with tm.assert_raises_regex(ValueError, "Data overlaps"):
|
||
|
df.update(other, raise_conflict=True)
|
||
|
|
||
|
def test_update_from_non_df(self):
|
||
|
d = {'a': Series([1, 2, 3, 4]), 'b': Series([5, 6, 7, 8])}
|
||
|
df = DataFrame(d)
|
||
|
|
||
|
d['a'] = Series([5, 6, 7, 8])
|
||
|
df.update(d)
|
||
|
|
||
|
expected = DataFrame(d)
|
||
|
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
d = {'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}
|
||
|
df = DataFrame(d)
|
||
|
|
||
|
d['a'] = [5, 6, 7, 8]
|
||
|
df.update(d)
|
||
|
|
||
|
expected = DataFrame(d)
|
||
|
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_join_str_datetime(self):
|
||
|
str_dates = ['20120209', '20120222']
|
||
|
dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]
|
||
|
|
||
|
A = DataFrame(str_dates, index=lrange(2), columns=['aa'])
|
||
|
C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)
|
||
|
|
||
|
tst = A.join(C, on='aa')
|
||
|
|
||
|
assert len(tst.columns) == 3
|
||
|
|
||
|
def test_join_multiindex_leftright(self):
|
||
|
# GH 10741
|
||
|
df1 = (pd.DataFrame([['a', 'x', 0.471780], ['a', 'y', 0.774908],
|
||
|
['a', 'z', 0.563634], ['b', 'x', -0.353756],
|
||
|
['b', 'y', 0.368062], ['b', 'z', -1.721840],
|
||
|
['c', 'x', 1], ['c', 'y', 2], ['c', 'z', 3]],
|
||
|
columns=['first', 'second', 'value1'])
|
||
|
.set_index(['first', 'second']))
|
||
|
|
||
|
df2 = (pd.DataFrame([['a', 10], ['b', 20]],
|
||
|
columns=['first', 'value2'])
|
||
|
.set_index(['first']))
|
||
|
|
||
|
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||
|
[-0.353756, 20], [0.368062, 20],
|
||
|
[-1.721840, 20],
|
||
|
[1.000000, np.nan], [2.000000, np.nan],
|
||
|
[3.000000, np.nan]],
|
||
|
index=df1.index, columns=['value1', 'value2'])
|
||
|
|
||
|
# these must be the same results (but columns are flipped)
|
||
|
assert_frame_equal(df1.join(df2, how='left'), exp)
|
||
|
assert_frame_equal(df2.join(df1, how='right'),
|
||
|
exp[['value2', 'value1']])
|
||
|
|
||
|
exp_idx = pd.MultiIndex.from_product([['a', 'b'], ['x', 'y', 'z']],
|
||
|
names=['first', 'second'])
|
||
|
exp = pd.DataFrame([[0.471780, 10], [0.774908, 10], [0.563634, 10],
|
||
|
[-0.353756, 20], [0.368062, 20], [-1.721840, 20]],
|
||
|
index=exp_idx, columns=['value1', 'value2'])
|
||
|
|
||
|
assert_frame_equal(df1.join(df2, how='right'), exp)
|
||
|
assert_frame_equal(df2.join(df1, how='left'),
|
||
|
exp[['value2', 'value1']])
|
||
|
|
||
|
def test_concat_named_keys(self):
|
||
|
# GH 14252
|
||
|
df = pd.DataFrame({'foo': [1, 2], 'bar': [0.1, 0.2]})
|
||
|
index = Index(['a', 'b'], name='baz')
|
||
|
concatted_named_from_keys = pd.concat([df, df], keys=index)
|
||
|
expected_named = pd.DataFrame(
|
||
|
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||
|
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||
|
names=['baz', None]))
|
||
|
assert_frame_equal(concatted_named_from_keys, expected_named)
|
||
|
|
||
|
index_no_name = Index(['a', 'b'], name=None)
|
||
|
concatted_named_from_names = pd.concat(
|
||
|
[df, df], keys=index_no_name, names=['baz'])
|
||
|
assert_frame_equal(concatted_named_from_names, expected_named)
|
||
|
|
||
|
concatted_unnamed = pd.concat([df, df], keys=index_no_name)
|
||
|
expected_unnamed = pd.DataFrame(
|
||
|
{'foo': [1, 2, 1, 2], 'bar': [0.1, 0.2, 0.1, 0.2]},
|
||
|
index=pd.MultiIndex.from_product((['a', 'b'], [0, 1]),
|
||
|
names=[None, None]))
|
||
|
assert_frame_equal(concatted_unnamed, expected_unnamed)
|
||
|
|
||
|
def test_concat_axis_parameter(self):
|
||
|
# GH 14369
|
||
|
df1 = pd.DataFrame({'A': [0.1, 0.2]}, index=range(2))
|
||
|
df2 = pd.DataFrame({'A': [0.3, 0.4]}, index=range(2))
|
||
|
|
||
|
# Index/row/0 DataFrame
|
||
|
expected_index = pd.DataFrame(
|
||
|
{'A': [0.1, 0.2, 0.3, 0.4]}, index=[0, 1, 0, 1])
|
||
|
|
||
|
concatted_index = pd.concat([df1, df2], axis='index')
|
||
|
assert_frame_equal(concatted_index, expected_index)
|
||
|
|
||
|
concatted_row = pd.concat([df1, df2], axis='rows')
|
||
|
assert_frame_equal(concatted_row, expected_index)
|
||
|
|
||
|
concatted_0 = pd.concat([df1, df2], axis=0)
|
||
|
assert_frame_equal(concatted_0, expected_index)
|
||
|
|
||
|
# Columns/1 DataFrame
|
||
|
expected_columns = pd.DataFrame(
|
||
|
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=['A', 'A'])
|
||
|
|
||
|
concatted_columns = pd.concat([df1, df2], axis='columns')
|
||
|
assert_frame_equal(concatted_columns, expected_columns)
|
||
|
|
||
|
concatted_1 = pd.concat([df1, df2], axis=1)
|
||
|
assert_frame_equal(concatted_1, expected_columns)
|
||
|
|
||
|
series1 = pd.Series([0.1, 0.2])
|
||
|
series2 = pd.Series([0.3, 0.4])
|
||
|
|
||
|
# Index/row/0 Series
|
||
|
expected_index_series = pd.Series(
|
||
|
[0.1, 0.2, 0.3, 0.4], index=[0, 1, 0, 1])
|
||
|
|
||
|
concatted_index_series = pd.concat([series1, series2], axis='index')
|
||
|
assert_series_equal(concatted_index_series, expected_index_series)
|
||
|
|
||
|
concatted_row_series = pd.concat([series1, series2], axis='rows')
|
||
|
assert_series_equal(concatted_row_series, expected_index_series)
|
||
|
|
||
|
concatted_0_series = pd.concat([series1, series2], axis=0)
|
||
|
assert_series_equal(concatted_0_series, expected_index_series)
|
||
|
|
||
|
# Columns/1 Series
|
||
|
expected_columns_series = pd.DataFrame(
|
||
|
[[0.1, 0.3], [0.2, 0.4]], index=[0, 1], columns=[0, 1])
|
||
|
|
||
|
concatted_columns_series = pd.concat(
|
||
|
[series1, series2], axis='columns')
|
||
|
assert_frame_equal(concatted_columns_series, expected_columns_series)
|
||
|
|
||
|
concatted_1_series = pd.concat([series1, series2], axis=1)
|
||
|
assert_frame_equal(concatted_1_series, expected_columns_series)
|
||
|
|
||
|
# Testing ValueError
|
||
|
with tm.assert_raises_regex(ValueError, 'No axis named'):
|
||
|
pd.concat([series1, series2], axis='something')
|
||
|
|
||
|
def test_concat_numerical_names(self):
|
||
|
# #15262 # #12223
|
||
|
df = pd.DataFrame({'col': range(9)},
|
||
|
dtype='int32',
|
||
|
index=(pd.MultiIndex
|
||
|
.from_product([['A0', 'A1', 'A2'],
|
||
|
['B0', 'B1', 'B2']],
|
||
|
names=[1, 2])))
|
||
|
result = pd.concat((df.iloc[:2, :], df.iloc[-2:, :]))
|
||
|
expected = pd.DataFrame({'col': [0, 1, 7, 8]},
|
||
|
dtype='int32',
|
||
|
index=pd.MultiIndex.from_tuples([('A0', 'B0'),
|
||
|
('A0', 'B1'),
|
||
|
('A2', 'B1'),
|
||
|
('A2', 'B2')],
|
||
|
names=[1, 2]))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameCombineFirst(TestData):
|
||
|
|
||
|
def test_combine_first_mixed(self):
|
||
|
a = Series(['a', 'b'], index=lrange(2))
|
||
|
b = Series(lrange(2), index=lrange(2))
|
||
|
f = DataFrame({'A': a, 'B': b})
|
||
|
|
||
|
a = Series(['a', 'b'], index=lrange(5, 7))
|
||
|
b = Series(lrange(2), index=lrange(5, 7))
|
||
|
g = DataFrame({'A': a, 'B': b})
|
||
|
|
||
|
exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]},
|
||
|
index=[0, 1, 5, 6])
|
||
|
combined = f.combine_first(g)
|
||
|
tm.assert_frame_equal(combined, exp)
|
||
|
|
||
|
def test_combine_first(self):
|
||
|
# disjoint
|
||
|
head, tail = self.frame[:5], self.frame[5:]
|
||
|
|
||
|
combined = head.combine_first(tail)
|
||
|
reordered_frame = self.frame.reindex(combined.index)
|
||
|
assert_frame_equal(combined, reordered_frame)
|
||
|
assert tm.equalContents(combined.columns, self.frame.columns)
|
||
|
assert_series_equal(combined['A'], reordered_frame['A'])
|
||
|
|
||
|
# same index
|
||
|
fcopy = self.frame.copy()
|
||
|
fcopy['A'] = 1
|
||
|
del fcopy['C']
|
||
|
|
||
|
fcopy2 = self.frame.copy()
|
||
|
fcopy2['B'] = 0
|
||
|
del fcopy2['D']
|
||
|
|
||
|
combined = fcopy.combine_first(fcopy2)
|
||
|
|
||
|
assert (combined['A'] == 1).all()
|
||
|
assert_series_equal(combined['B'], fcopy['B'])
|
||
|
assert_series_equal(combined['C'], fcopy2['C'])
|
||
|
assert_series_equal(combined['D'], fcopy['D'])
|
||
|
|
||
|
# overlap
|
||
|
head, tail = reordered_frame[:10].copy(), reordered_frame
|
||
|
head['A'] = 1
|
||
|
|
||
|
combined = head.combine_first(tail)
|
||
|
assert (combined['A'][:10] == 1).all()
|
||
|
|
||
|
# reverse overlap
|
||
|
tail['A'][:10] = 0
|
||
|
combined = tail.combine_first(head)
|
||
|
assert (combined['A'][:10] == 0).all()
|
||
|
|
||
|
# no overlap
|
||
|
f = self.frame[:10]
|
||
|
g = self.frame[10:]
|
||
|
combined = f.combine_first(g)
|
||
|
assert_series_equal(combined['A'].reindex(f.index), f['A'])
|
||
|
assert_series_equal(combined['A'].reindex(g.index), g['A'])
|
||
|
|
||
|
# corner cases
|
||
|
comb = self.frame.combine_first(self.empty)
|
||
|
assert_frame_equal(comb, self.frame)
|
||
|
|
||
|
comb = self.empty.combine_first(self.frame)
|
||
|
assert_frame_equal(comb, self.frame)
|
||
|
|
||
|
comb = self.frame.combine_first(DataFrame(index=["faz", "boo"]))
|
||
|
assert "faz" in comb.index
|
||
|
|
||
|
# #2525
|
||
|
df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)])
|
||
|
df2 = DataFrame({}, columns=['b'])
|
||
|
result = df.combine_first(df2)
|
||
|
assert 'b' in result
|
||
|
|
||
|
def test_combine_first_mixed_bug(self):
|
||
|
idx = Index(['a', 'b', 'c', 'e'])
|
||
|
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||
|
ser2 = Series(['a', 'b', 'c', 'e'], index=idx)
|
||
|
ser3 = Series([12, 4, 5, 97], index=idx)
|
||
|
|
||
|
frame1 = DataFrame({"col0": ser1,
|
||
|
"col2": ser2,
|
||
|
"col3": ser3})
|
||
|
|
||
|
idx = Index(['a', 'b', 'c', 'f'])
|
||
|
ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx)
|
||
|
ser2 = Series(['a', 'b', 'c', 'f'], index=idx)
|
||
|
ser3 = Series([12, 4, 5, 97], index=idx)
|
||
|
|
||
|
frame2 = DataFrame({"col1": ser1,
|
||
|
"col2": ser2,
|
||
|
"col5": ser3})
|
||
|
|
||
|
combined = frame1.combine_first(frame2)
|
||
|
assert len(combined.columns) == 5
|
||
|
|
||
|
# gh 3016 (same as in update)
|
||
|
df = DataFrame([[1., 2., False, True], [4., 5., True, False]],
|
||
|
columns=['A', 'B', 'bool1', 'bool2'])
|
||
|
|
||
|
other = DataFrame([[45, 45]], index=[0], columns=['A', 'B'])
|
||
|
result = df.combine_first(other)
|
||
|
assert_frame_equal(result, df)
|
||
|
|
||
|
df.loc[0, 'A'] = np.nan
|
||
|
result = df.combine_first(other)
|
||
|
df.loc[0, 'A'] = 45
|
||
|
assert_frame_equal(result, df)
|
||
|
|
||
|
# doc example
|
||
|
df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan],
|
||
|
'B': [np.nan, 2., 3., np.nan, 6.]})
|
||
|
|
||
|
df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
|
||
|
'B': [np.nan, np.nan, 3., 4., 6., 8.]})
|
||
|
|
||
|
result = df1.combine_first(df2)
|
||
|
expected = DataFrame(
|
||
|
{'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# GH3552, return object dtype with bools
|
||
|
df1 = DataFrame(
|
||
|
[[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]])
|
||
|
df2 = DataFrame(
|
||
|
[[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2])
|
||
|
|
||
|
result = df1.combine_first(df2)[2]
|
||
|
expected = Series([True, True, False], name=2)
|
||
|
assert_series_equal(result, expected)
|
||
|
|
||
|
# GH 3593, converting datetime64[ns] incorrecly
|
||
|
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||
|
datetime(2000, 1, 2),
|
||
|
datetime(2000, 1, 3)]})
|
||
|
df1 = DataFrame({"a": [None, None, None]})
|
||
|
df2 = df1.combine_first(df0)
|
||
|
assert_frame_equal(df2, df0)
|
||
|
|
||
|
df2 = df0.combine_first(df1)
|
||
|
assert_frame_equal(df2, df0)
|
||
|
|
||
|
df0 = DataFrame({"a": [datetime(2000, 1, 1),
|
||
|
datetime(2000, 1, 2),
|
||
|
datetime(2000, 1, 3)]})
|
||
|
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
|
||
|
df2 = df1.combine_first(df0)
|
||
|
result = df0.copy()
|
||
|
result.iloc[0, :] = df1.iloc[0, :]
|
||
|
assert_frame_equal(df2, result)
|
||
|
|
||
|
df2 = df0.combine_first(df1)
|
||
|
assert_frame_equal(df2, df0)
|
||
|
|
||
|
def test_combine_first_align_nan(self):
|
||
|
# GH 7509 (not fixed)
|
||
|
dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]],
|
||
|
columns=['a', 'b'])
|
||
|
dfb = pd.DataFrame([[4], [5]], columns=['b'])
|
||
|
assert dfa['a'].dtype == 'datetime64[ns]'
|
||
|
assert dfa['b'].dtype == 'int64'
|
||
|
|
||
|
res = dfa.combine_first(dfb)
|
||
|
exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT],
|
||
|
'b': [2., 5.]}, columns=['a', 'b'])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['a'].dtype == 'datetime64[ns]'
|
||
|
# ToDo: this must be int64
|
||
|
assert res['b'].dtype == 'float64'
|
||
|
|
||
|
res = dfa.iloc[:0].combine_first(dfb)
|
||
|
exp = pd.DataFrame({'a': [np.nan, np.nan],
|
||
|
'b': [4, 5]}, columns=['a', 'b'])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
# ToDo: this must be datetime64
|
||
|
assert res['a'].dtype == 'float64'
|
||
|
# ToDo: this must be int64
|
||
|
assert res['b'].dtype == 'int64'
|
||
|
|
||
|
def test_combine_first_timezone(self):
|
||
|
# see gh-7630
|
||
|
data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC')
|
||
|
df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'],
|
||
|
data=data1,
|
||
|
index=pd.date_range('20140627', periods=1))
|
||
|
data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC')
|
||
|
df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'],
|
||
|
data=data2,
|
||
|
index=pd.date_range('20140628', periods=1))
|
||
|
res = df2[['UTCdatetime']].combine_first(df1)
|
||
|
exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01',
|
||
|
tz='UTC'),
|
||
|
pd.Timestamp('2012-12-12 12:12',
|
||
|
tz='UTC')],
|
||
|
'abc': [pd.Timestamp('2010-01-01 01:01:00',
|
||
|
tz='UTC'), pd.NaT]},
|
||
|
columns=['UTCdatetime', 'abc'],
|
||
|
index=pd.date_range('20140627', periods=2,
|
||
|
freq='D'))
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['UTCdatetime'].dtype == 'datetime64[ns, UTC]'
|
||
|
assert res['abc'].dtype == 'datetime64[ns, UTC]'
|
||
|
|
||
|
# see gh-10567
|
||
|
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC')
|
||
|
df1 = pd.DataFrame({'DATE': dts1})
|
||
|
dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC')
|
||
|
df2 = pd.DataFrame({'DATE': dts2})
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
tm.assert_frame_equal(res, df1)
|
||
|
assert res['DATE'].dtype == 'datetime64[ns, UTC]'
|
||
|
|
||
|
dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03',
|
||
|
'2011-01-04'], tz='US/Eastern')
|
||
|
df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7])
|
||
|
dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02',
|
||
|
'2012-01-03'], tz='US/Eastern')
|
||
|
df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5])
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT',
|
||
|
'2012-01-02', '2011-01-03', '2011-01-04'],
|
||
|
tz='US/Eastern')
|
||
|
exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
|
||
|
# different tz
|
||
|
dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern')
|
||
|
df1 = pd.DataFrame({'DATE': dts1})
|
||
|
dts2 = pd.date_range('2015-01-03', '2015-01-05')
|
||
|
df2 = pd.DataFrame({'DATE': dts2})
|
||
|
|
||
|
# if df1 doesn't have NaN, keep its dtype
|
||
|
res = df1.combine_first(df2)
|
||
|
tm.assert_frame_equal(res, df1)
|
||
|
assert res['DATE'].dtype == 'datetime64[ns, US/Eastern]'
|
||
|
|
||
|
dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern')
|
||
|
df1 = pd.DataFrame({'DATE': dts1})
|
||
|
dts2 = pd.date_range('2015-01-01', '2015-01-03')
|
||
|
df2 = pd.DataFrame({'DATE': dts2})
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'),
|
||
|
pd.Timestamp('2015-01-02', tz='US/Eastern'),
|
||
|
pd.Timestamp('2015-01-03')]
|
||
|
exp = pd.DataFrame({'DATE': exp_dts})
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['DATE'].dtype == 'object'
|
||
|
|
||
|
def test_combine_first_timedelta(self):
|
||
|
data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day'])
|
||
|
df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7])
|
||
|
data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day'])
|
||
|
df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5])
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT',
|
||
|
'11 day', '3 day', '4 day'])
|
||
|
exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['TD'].dtype == 'timedelta64[ns]'
|
||
|
|
||
|
def test_combine_first_period(self):
|
||
|
data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03',
|
||
|
'2011-04'], freq='M')
|
||
|
df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7])
|
||
|
data2 = pd.PeriodIndex(['2012-01-01', '2012-02',
|
||
|
'2012-03'], freq='M')
|
||
|
df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5])
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT',
|
||
|
'2012-02', '2011-03', '2011-04'],
|
||
|
freq='M')
|
||
|
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['P'].dtype == 'object'
|
||
|
|
||
|
# different freq
|
||
|
dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02',
|
||
|
'2012-01-03'], freq='D')
|
||
|
df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5])
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
exp_dts = [pd.Period('2011-01', freq='M'),
|
||
|
pd.Period('2012-01-01', freq='D'),
|
||
|
pd.NaT,
|
||
|
pd.Period('2012-01-02', freq='D'),
|
||
|
pd.Period('2011-03', freq='M'),
|
||
|
pd.Period('2011-04', freq='M')]
|
||
|
exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7])
|
||
|
tm.assert_frame_equal(res, exp)
|
||
|
assert res['P'].dtype == 'object'
|
||
|
|
||
|
def test_combine_first_int(self):
|
||
|
# GH14687 - integer series that do no align exactly
|
||
|
|
||
|
df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64')
|
||
|
df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64')
|
||
|
|
||
|
res = df1.combine_first(df2)
|
||
|
tm.assert_frame_equal(res, df1)
|
||
|
assert res['a'].dtype == 'int64'
|
||
|
|
||
|
def test_concat_datetime_datetime64_frame(self):
|
||
|
# #2624
|
||
|
rows = []
|
||
|
rows.append([datetime(2010, 1, 1), 1])
|
||
|
rows.append([datetime(2010, 1, 2), 'hi'])
|
||
|
|
||
|
df2_obj = DataFrame.from_records(rows, columns=['date', 'test'])
|
||
|
|
||
|
ind = date_range(start="2000/1/1", freq="D", periods=10)
|
||
|
df1 = DataFrame({'date': ind, 'test': lrange(10)})
|
||
|
|
||
|
# it works!
|
||
|
pd.concat([df1, df2_obj])
|
||
|
|
||
|
|
||
|
class TestDataFrameUpdate(TestData):
|
||
|
|
||
|
def test_update_nan(self):
|
||
|
# #15593 #15617
|
||
|
# test 1
|
||
|
df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)})
|
||
|
df2 = DataFrame({'A': [None, 2, 3]})
|
||
|
expected = df1.copy()
|
||
|
df1.update(df2, overwrite=False)
|
||
|
|
||
|
tm.assert_frame_equal(df1, expected)
|
||
|
|
||
|
# test 2
|
||
|
df1 = DataFrame({'A': [1.0, None, 3],
|
||
|
'B': date_range('2000', periods=3)})
|
||
|
df2 = DataFrame({'A': [None, 2, 3]})
|
||
|
expected = DataFrame({'A': [1.0, 2, 3],
|
||
|
'B': date_range('2000', periods=3)})
|
||
|
df1.update(df2, overwrite=False)
|
||
|
|
||
|
tm.assert_frame_equal(df1, expected)
|