478 lines
18 KiB
Python
478 lines
18 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import print_function
|
|
|
|
import pytest
|
|
import numpy as np
|
|
|
|
from pandas.compat import lrange, u
|
|
from pandas import DataFrame, Series, MultiIndex, date_range
|
|
import pandas as pd
|
|
|
|
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
|
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas.tests.frame.common import TestData
|
|
|
|
|
|
class TestDataFrameNonuniqueIndexes(TestData):
    """Tests for DataFrames whose row or column labels contain duplicates."""

    @staticmethod
    def _check_frame(result, expected=None):
        """Compare ``result`` to ``expected`` (if given), then smoke-test it.

        The ``dtypes`` lookup and the ``str`` conversion are evaluated only
        to confirm that neither raises for the frame under test.
        """
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    def test_column_dups_operations(self):
        """Assign, insert, delete and operate on duplicate-column frames."""
        check = self._check_frame

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with tm.assert_raises_regex(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df._consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup: rejected unless allow_duplicates=True is passed
        with tm.assert_raises_regex(ValueError, 'cannot insert'):
            df.insert(2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                             columns=['bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        # compares shape and dtype as well as the element values
        tm.assert_numpy_array_equal(result, expected)

        # rename, GH 4403
        df4 = DataFrame(
            {'RT': [0.0454],
             'TClose': [22.02],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_ID': [600809] * 3,
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        with pytest.raises(ValueError):
            df.reindex(columns=['bar'])
        with pytest.raises(ValueError):
            df.reindex(columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            # 'A' is listed once in the mapping; ``columns`` repeats it
            expected_df = DataFrame({'A': expected_ser,
                                     'B': this_df['B']},
                                    columns=['A', 'B', 'A'])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)

    def test_column_dups2(self):
        """``drop`` and ``dropna`` on frames with repeated column labels."""

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)

    def test_column_dups_indexing(self):
        """Boolean masks, ``where``, alignment and selection with dups."""
        check = self._check_frame

        # boolean indexing
        # GH 4879
        dups = ['A', 'A', 'C', 'D']
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=['A', 'B', 'C', 'D'], dtype='float64')
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=['A', 'B', 'C', 'D'], dtype='float64')
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        with pytest.raises(ValueError):
            df[df.A > 6]

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                        columns=['A', 'B'])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                        columns=['A', 'A'])

        # not-comparing like-labelled
        with pytest.raises(ValueError):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame([[False, True], [True, False],
                              [False, False], [True, False]],
                             columns=['A', 'A'])
        assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame({'one': Series([True, True, False],
                                          index=['a', 'b', 'c']),
                            'two': Series([False, False, True, False],
                                          index=['a', 'b', 'c', 'd']),
                            'three': Series([False, True, True, True],
                                            index=['a', 'b', 'c', 'd'])})
        expected = pd.concat(
            [dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
        result = dfbool[['one', 'three', 'one']]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']].copy()
        expected = z.loc[['a', 'c', 'a']]

        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']]
        result = z.loc[['a', 'c', 'a']]
        check(result, expected)

    def test_column_dups_indexing2(self):
        """Column arithmetic on a frame whose row index is non-unique."""

        # GH 8363
        # datetime ops with a non-unique index
        df = DataFrame({'A': np.arange(5, dtype='int64'),
                        'B': np.arange(1, 6, dtype='int64')},
                       index=[2, 2, 3, 3, 4])
        result = df.B - df.A
        expected = Series(1, index=[2, 2, 3, 3, 4])
        assert_series_equal(result, expected)

        df = DataFrame({'A': date_range('20130101', periods=5),
                        'B': date_range('20130101 09:00:00', periods=5)},
                       index=[2, 2, 3, 3, 4])
        result = df.B - df.A
        expected = Series(pd.Timedelta('9 hours'), index=[2, 2, 3, 3, 4])
        assert_series_equal(result, expected)

    def test_columns_with_dups(self):
        """Relabel duplicate columns and build frames with dups across dtypes."""
        # GH 3468 related

        # basic
        df = DataFrame([[1, 2]], columns=['a', 'a'])
        df.columns = ['a', 'a.1']
        str(df)
        expected = DataFrame([[1, 2]], columns=['a', 'a.1'])
        assert_frame_equal(df, expected)

        df = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a'])
        df.columns = ['b', 'a', 'a.1']
        str(df)
        expected = DataFrame([[1, 2, 3]], columns=['b', 'a', 'a.1'])
        assert_frame_equal(df, expected)

        # with a dup index
        df = DataFrame([[1, 2]], columns=['a', 'a'])
        df.columns = ['b', 'b']
        str(df)
        expected = DataFrame([[1, 2]], columns=['b', 'b'])
        assert_frame_equal(df, expected)

        # multi-dtype
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=['a', 'a', 'b', 'b', 'd', 'c', 'c'])
        df.columns = list('ABCDEFG')
        str(df)
        expected = DataFrame(
            [[1, 2, 1., 2., 3., 'foo', 'bar']], columns=list('ABCDEFG'))
        assert_frame_equal(df, expected)

        # multi-dtype under one repeated label: build the frame for real and
        # check that it can be relabelled like the cases above
        df = DataFrame([[1, 2, 'foo', 'bar']], columns=['a', 'a', 'a', 'a'])
        df.columns = ['a', 'a.1', 'a.2', 'a.3']
        str(df)
        expected = DataFrame([[1, 2, 'foo', 'bar']],
                             columns=['a', 'a.1', 'a.2', 'a.3'])
        assert_frame_equal(df, expected)

        # dups across blocks
        df_float = DataFrame(np.random.randn(10, 3), dtype='float64')
        # cast explicitly instead of relying on the constructor to coerce
        # float data to an integer dtype
        df_int = DataFrame(np.random.randn(10, 3).astype('int64'))
        df_bool = DataFrame(True, index=df_float.index,
                            columns=df_float.columns)
        df_object = DataFrame('foo', index=df_float.index,
                              columns=df_float.columns)
        df_dt = DataFrame(pd.Timestamp('20010101'),
                          index=df_float.index,
                          columns=df_float.columns)
        df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)

        assert len(df._data._blknos) == len(df.columns)
        assert len(df._data._blklocs) == len(df.columns)

        # testing iloc: every positional column lookup must succeed
        for i in range(len(df.columns)):
            df.iloc[:, i]

        # dup columns across dtype GH 2079/2194
        vals = [[1, -1, 2.], [2, -2, 3.]]
        rs = DataFrame(vals, columns=['A', 'A', 'B'])
        xp = DataFrame(vals)
        xp.columns = ['A', 'A', 'B']
        assert_frame_equal(rs, xp)

    def test_values_duplicates(self):
        """``.values`` on a mixed-dtype frame with repeated column labels."""
        df = DataFrame([[1, 2, 'a', 'b'],
                        [1, 2, 'a', 'b']],
                       columns=['one', 'one', 'two', 'two'])

        result = df.values
        expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']],
                            dtype=object)

        tm.assert_numpy_array_equal(result, expected)

    def test_set_value_by_index(self):
        """Positional assignment to one column must leave the others alone."""
        # See gh-12344
        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = list('AAA')
        # copy so the snapshot cannot alias ``df`` and change along with it
        expected = df.iloc[:, 2].copy()

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 2], expected)

        df = DataFrame(np.arange(9).reshape(3, 3).T)
        df.columns = [2, float(2), str(2)]
        expected = df.iloc[:, 1].copy()

        df.iloc[:, 0] = 3
        assert_series_equal(df.iloc[:, 1], expected)

    def test_insert_with_columns_dups(self):
        """Repeated ``insert`` at position 0 under one duplicated label."""
        # GH 14291
        df = pd.DataFrame()
        df.insert(0, 'A', ['g', 'h', 'i'], allow_duplicates=True)
        df.insert(0, 'A', ['d', 'e', 'f'], allow_duplicates=True)
        df.insert(0, 'A', ['a', 'b', 'c'], allow_duplicates=True)
        exp = pd.DataFrame([['a', 'd', 'g'], ['b', 'e', 'h'],
                            ['c', 'f', 'i']], columns=['A', 'A', 'A'])
        assert_frame_equal(df, exp)
|