284 lines
9.5 KiB
Python
284 lines
9.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import print_function
|
||
|
import pytest
|
||
|
from pandas.compat import range, lrange
|
||
|
import numpy as np
|
||
|
from pandas.compat import PY36
|
||
|
|
||
|
from pandas import DataFrame, Series, Index, MultiIndex
|
||
|
|
||
|
from pandas.util.testing import assert_frame_equal
|
||
|
|
||
|
import pandas.util.testing as tm
|
||
|
|
||
|
from pandas.tests.frame.common import TestData
|
||
|
|
||
|
|
||
|
# Column add, remove, delete.
|
||
|
|
||
|
|
||
|
class TestDataFrameMutateColumns(TestData):
|
||
|
|
||
|
def test_assign(self):
|
||
|
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||
|
original = df.copy()
|
||
|
result = df.assign(C=df.B / df.A)
|
||
|
expected = df.copy()
|
||
|
expected['C'] = [4, 2.5, 2]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# lambda syntax
|
||
|
result = df.assign(C=lambda x: x.B / x.A)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# original is unmodified
|
||
|
assert_frame_equal(df, original)
|
||
|
|
||
|
# Non-Series array-like
|
||
|
result = df.assign(C=[4, 2.5, 2])
|
||
|
assert_frame_equal(result, expected)
|
||
|
# original is unmodified
|
||
|
assert_frame_equal(df, original)
|
||
|
|
||
|
result = df.assign(B=df.B / df.A)
|
||
|
expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# overwrite
|
||
|
result = df.assign(A=df.A + df.B)
|
||
|
expected = df.copy()
|
||
|
expected['A'] = [5, 7, 9]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# lambda
|
||
|
result = df.assign(A=lambda x: x.A + x.B)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_assign_multiple(self):
|
||
|
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B'])
|
||
|
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
|
||
|
expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5],
|
||
|
[3, 6, 9, 3, 6]], columns=list('ABCDE'))
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_assign_order(self):
|
||
|
# GH 9818
|
||
|
df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
|
||
|
result = df.assign(D=df.A + df.B, C=df.A - df.B)
|
||
|
|
||
|
if PY36:
|
||
|
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]],
|
||
|
columns=list('ABDC'))
|
||
|
else:
|
||
|
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||
|
columns=list('ABCD'))
|
||
|
assert_frame_equal(result, expected)
|
||
|
result = df.assign(C=df.A - df.B, D=df.A + df.B)
|
||
|
|
||
|
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]],
|
||
|
columns=list('ABCD'))
|
||
|
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_assign_bad(self):
|
||
|
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||
|
|
||
|
# non-keyword argument
|
||
|
with pytest.raises(TypeError):
|
||
|
df.assign(lambda x: x.A)
|
||
|
with pytest.raises(AttributeError):
|
||
|
df.assign(C=df.A, D=df.A + df.C)
|
||
|
|
||
|
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
|
||
|
3.6 and above""")
|
||
|
def test_assign_dependent_old_python(self):
|
||
|
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
|
||
|
|
||
|
# Key C does not exist at definition time of df
|
||
|
with pytest.raises(KeyError):
|
||
|
df.assign(C=lambda df: df.A,
|
||
|
D=lambda df: df['A'] + df['C'])
|
||
|
with pytest.raises(KeyError):
|
||
|
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
|
||
|
|
||
|
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
|
||
|
python 3.5 and below""")
|
||
|
def test_assign_dependent(self):
|
||
|
df = DataFrame({'A': [1, 2], 'B': [3, 4]})
|
||
|
|
||
|
result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
|
||
|
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
|
||
|
columns=list('ABCD'))
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.assign(C=lambda df: df.A,
|
||
|
D=lambda df: df['A'] + df['C'])
|
||
|
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
|
||
|
columns=list('ABCD'))
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_insert_error_msmgs(self):
|
||
|
|
||
|
# GH 7432
|
||
|
df = DataFrame({'foo': ['a', 'b', 'c'], 'bar': [
|
||
|
1, 2, 3], 'baz': ['d', 'e', 'f']}).set_index('foo')
|
||
|
s = DataFrame({'foo': ['a', 'b', 'c', 'a'], 'fiz': [
|
||
|
'g', 'h', 'i', 'j']}).set_index('foo')
|
||
|
msg = 'cannot reindex from a duplicate axis'
|
||
|
with tm.assert_raises_regex(ValueError, msg):
|
||
|
df['newcol'] = s
|
||
|
|
||
|
# GH 4107, more descriptive error message
|
||
|
df = DataFrame(np.random.randint(0, 2, (4, 4)),
|
||
|
columns=['a', 'b', 'c', 'd'])
|
||
|
|
||
|
msg = 'incompatible index of inserted column with frame index'
|
||
|
with tm.assert_raises_regex(TypeError, msg):
|
||
|
df['gr'] = df.groupby(['b', 'c']).count()
|
||
|
|
||
|
def test_insert_benchmark(self):
|
||
|
# from the vb_suite/frame_methods/frame_insert_columns
|
||
|
N = 10
|
||
|
K = 5
|
||
|
df = DataFrame(index=lrange(N))
|
||
|
new_col = np.random.randn(N)
|
||
|
for i in range(K):
|
||
|
df[i] = new_col
|
||
|
expected = DataFrame(np.repeat(new_col, K).reshape(N, K),
|
||
|
index=lrange(N))
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_insert(self):
|
||
|
df = DataFrame(np.random.randn(5, 3), index=np.arange(5),
|
||
|
columns=['c', 'b', 'a'])
|
||
|
|
||
|
df.insert(0, 'foo', df['a'])
|
||
|
tm.assert_index_equal(df.columns, Index(['foo', 'c', 'b', 'a']))
|
||
|
tm.assert_series_equal(df['a'], df['foo'], check_names=False)
|
||
|
|
||
|
df.insert(2, 'bar', df['c'])
|
||
|
tm.assert_index_equal(df.columns,
|
||
|
Index(['foo', 'c', 'bar', 'b', 'a']))
|
||
|
tm.assert_almost_equal(df['c'], df['bar'], check_names=False)
|
||
|
|
||
|
# diff dtype
|
||
|
|
||
|
# new item
|
||
|
df['x'] = df['a'].astype('float32')
|
||
|
result = Series(dict(float32=1, float64=5))
|
||
|
assert (df.get_dtype_counts().sort_index() == result).all()
|
||
|
|
||
|
# replacing current (in different block)
|
||
|
df['a'] = df['a'].astype('float32')
|
||
|
result = Series(dict(float32=2, float64=4))
|
||
|
assert (df.get_dtype_counts().sort_index() == result).all()
|
||
|
|
||
|
df['y'] = df['a'].astype('int32')
|
||
|
result = Series(dict(float32=2, float64=4, int32=1))
|
||
|
assert (df.get_dtype_counts().sort_index() == result).all()
|
||
|
|
||
|
with tm.assert_raises_regex(ValueError, 'already exists'):
|
||
|
df.insert(1, 'a', df['b'])
|
||
|
pytest.raises(ValueError, df.insert, 1, 'c', df['b'])
|
||
|
|
||
|
df.columns.name = 'some_name'
|
||
|
# preserve columns name field
|
||
|
df.insert(0, 'baz', df['c'])
|
||
|
assert df.columns.name == 'some_name'
|
||
|
|
||
|
# GH 13522
|
||
|
df = DataFrame(index=['A', 'B', 'C'])
|
||
|
df['X'] = df.index
|
||
|
df['X'] = ['x', 'y', 'z']
|
||
|
exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
|
||
|
assert_frame_equal(df, exp)
|
||
|
|
||
|
def test_delitem(self):
|
||
|
del self.frame['A']
|
||
|
assert 'A' not in self.frame
|
||
|
|
||
|
def test_delitem_multiindex(self):
|
||
|
midx = MultiIndex.from_product([['A', 'B'], [1, 2]])
|
||
|
df = DataFrame(np.random.randn(4, 4), columns=midx)
|
||
|
assert len(df.columns) == 4
|
||
|
assert ('A', ) in df.columns
|
||
|
assert 'A' in df.columns
|
||
|
|
||
|
result = df['A']
|
||
|
assert isinstance(result, DataFrame)
|
||
|
del df['A']
|
||
|
|
||
|
assert len(df.columns) == 2
|
||
|
|
||
|
# A still in the levels, BUT get a KeyError if trying
|
||
|
# to delete
|
||
|
assert ('A', ) not in df.columns
|
||
|
with pytest.raises(KeyError):
|
||
|
del df[('A',)]
|
||
|
|
||
|
# behavior of dropped/deleted MultiIndex levels changed from
|
||
|
# GH 2770 to GH 19027: MultiIndex no longer '.__contains__'
|
||
|
# levels which are dropped/deleted
|
||
|
assert 'A' not in df.columns
|
||
|
with pytest.raises(KeyError):
|
||
|
del df['A']
|
||
|
|
||
|
def test_pop(self):
|
||
|
self.frame.columns.name = 'baz'
|
||
|
|
||
|
self.frame.pop('A')
|
||
|
assert 'A' not in self.frame
|
||
|
|
||
|
self.frame['foo'] = 'bar'
|
||
|
self.frame.pop('foo')
|
||
|
assert 'foo' not in self.frame
|
||
|
# TODO assert self.frame.columns.name == 'baz'
|
||
|
|
||
|
# gh-10912: inplace ops cause caching issue
|
||
|
a = DataFrame([[1, 2, 3], [4, 5, 6]], columns=[
|
||
|
'A', 'B', 'C'], index=['X', 'Y'])
|
||
|
b = a.pop('B')
|
||
|
b += 1
|
||
|
|
||
|
# original frame
|
||
|
expected = DataFrame([[1, 3], [4, 6]], columns=[
|
||
|
'A', 'C'], index=['X', 'Y'])
|
||
|
tm.assert_frame_equal(a, expected)
|
||
|
|
||
|
# result
|
||
|
expected = Series([2, 5], index=['X', 'Y'], name='B') + 1
|
||
|
tm.assert_series_equal(b, expected)
|
||
|
|
||
|
def test_pop_non_unique_cols(self):
|
||
|
df = DataFrame({0: [0, 1], 1: [0, 1], 2: [4, 5]})
|
||
|
df.columns = ["a", "b", "a"]
|
||
|
|
||
|
res = df.pop("a")
|
||
|
assert type(res) == DataFrame
|
||
|
assert len(res) == 2
|
||
|
assert len(df.columns) == 1
|
||
|
assert "b" in df.columns
|
||
|
assert "a" not in df.columns
|
||
|
assert len(df.index) == 2
|
||
|
|
||
|
def test_insert_column_bug_4032(self):
|
||
|
|
||
|
# GH4032, inserting a column and renaming causing errors
|
||
|
df = DataFrame({'b': [1.1, 2.2]})
|
||
|
df = df.rename(columns={})
|
||
|
df.insert(0, 'a', [1, 2])
|
||
|
|
||
|
result = df.rename(columns={})
|
||
|
str(result)
|
||
|
expected = DataFrame([[1, 1.1], [2, 2.2]], columns=['a', 'b'])
|
||
|
assert_frame_equal(result, expected)
|
||
|
df.insert(0, 'c', [1.3, 2.3])
|
||
|
|
||
|
result = df.rename(columns={})
|
||
|
str(result)
|
||
|
|
||
|
expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]],
|
||
|
columns=['c', 'a', 'b'])
|
||
|
assert_frame_equal(result, expected)
|