# laywerrobot/lib/python3.6/site-packages/pandas/tests/util/test_hashing.py

import pytest
import datetime
from warnings import catch_warnings
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.util import hash_array, hash_pandas_object
from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar
import pandas.util.testing as tm
class TestHashing(object):
def setup_method(self, method):
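        # fixture frame covering a spread of dtypes: ints, floats with NaN,
        # categorical, object, bool, datetime, tz-aware datetime, timedelta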
self.df = DataFrame(
{'i32': np.array([1, 2, 3] * 3, dtype='int32'),
'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
'obj': Series(['d', 'e', 'f'] * 3),
'bool': np.array([True, False, True] * 3),
'dt': Series(pd.date_range('20130101', periods=9)),
'dt_tz': Series(pd.date_range('20130101', periods=9,
tz='US/Eastern')),
'td': Series(pd.timedelta_range('2000', periods=9))})
def test_consistency(self):
        # check that our hash does not change because of a mistake
        # in the actual code; these hard-coded values are the ground truth
result = hash_pandas_object(Index(['foo', 'bar', 'baz']))
expected = Series(np.array([3600424527151052760, 1374399572096150070,
477881037637427054], dtype='uint64'),
index=['foo', 'bar', 'baz'])
tm.assert_series_equal(result, expected)
def test_hash_array(self):
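        # hashing the same ndarray twice should give identical results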
for name, s in self.df.iteritems():
a = s.values
tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
def test_hash_array_mixed(self):
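        # ints in a mixed or object array should hash like their
        # string representations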
result1 = hash_array(np.array([3, 4, 'All']))
result2 = hash_array(np.array(['3', '4', 'All']))
result3 = hash_array(np.array([3, 4, 'All'], dtype=object))
tm.assert_numpy_array_equal(result1, result2)
tm.assert_numpy_array_equal(result1, result3)
def test_hash_array_errors(self):
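        # scalars are not accepted; hash_array expects an ndarray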
for val in [5, 'foo', pd.Timestamp('20130101')]:
pytest.raises(TypeError, hash_array, val)
def check_equal(self, obj, **kwargs):
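        # hashing the same object twice must be deterministic,
        # both with and without the index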
a = hash_pandas_object(obj, **kwargs)
b = hash_pandas_object(obj, **kwargs)
tm.assert_series_equal(a, b)
kwargs.pop('index', None)
a = hash_pandas_object(obj, **kwargs)
b = hash_pandas_object(obj, **kwargs)
tm.assert_series_equal(a, b)
def check_not_equal_with_index(self, obj):
        # check that the hash differs when the index is
        # included vs. excluded
if not isinstance(obj, Index):
a = hash_pandas_object(obj, index=True)
b = hash_pandas_object(obj, index=False)
if len(obj):
assert not (a == b).all()
def test_hash_tuples(self):
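        # hashing a list of tuples should match hashing the
        # equivalent MultiIndex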
tups = [(1, 'one'), (1, 'two'), (2, 'one')]
result = hash_tuples(tups)
expected = hash_pandas_object(MultiIndex.from_tuples(tups)).values
tm.assert_numpy_array_equal(result, expected)
result = hash_tuples(tups[0])
assert result == expected[0]
def test_hash_tuple(self):
# test equivalence between hash_tuples and hash_tuple
for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
('A', pd.Timestamp("2012-01-01"))]:
result = hash_tuple(tup)
expected = hash_tuples([tup])[0]
assert result == expected
def test_hash_scalar(self):
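        # _hash_scalar should agree with hash_array on a
        # length-1 object array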
for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
datetime.datetime(2012, 1, 1),
pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
pd.Timedelta('1 days'), datetime.timedelta(1),
pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
np.nan, pd.NaT, None]:
result = _hash_scalar(val)
expected = hash_array(np.array([val], dtype=object),
categorize=True)
assert result[0] == expected[0]
def test_hash_tuples_err(self):
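        # scalar (non-tuple) inputs should raise TypeError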
for val in [5, 'foo', pd.Timestamp('20130101')]:
pytest.raises(TypeError, hash_tuples, val)
def test_multiindex_unique(self):
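        # a unique MultiIndex should produce unique hashes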
mi = MultiIndex.from_tuples([(118, 472), (236, 118),
(51, 204), (102, 51)])
assert mi.is_unique
result = hash_pandas_object(mi)
assert result.is_unique
def test_multiindex_objects(self):
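        # hashes should depend only on the index values, not on the
        # internal layout of the MultiIndex levels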
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
names=['col1', 'col2'])
recons = mi._sort_levels_monotonic()
# these are equal
assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))
        # _hashed_values and hash_pandas_object(..., index=False)
        # should be equivalent
expected = hash_pandas_object(
mi, index=False).values
result = mi._hashed_values
tm.assert_numpy_array_equal(result, expected)
expected = hash_pandas_object(
recons, index=False).values
result = recons._hashed_values
tm.assert_numpy_array_equal(result, expected)
expected = mi._hashed_values
result = recons._hashed_values
# values should match, but in different order
tm.assert_numpy_array_equal(np.sort(result),
np.sort(expected))
def test_hash_pandas_object(self):
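        # deterministic hashing for a variety of Series, Index,
        # DataFrame and MultiIndex objects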
for obj in [Series([1, 2, 3]),
Series([1.0, 1.5, 3.2]),
Series([1.0, 1.5, np.nan]),
Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
Series(['a', 'b', 'c']),
Series(['a', np.nan, 'c']),
Series(['a', None, 'c']),
Series([True, False, True]),
Series(),
Index([1, 2, 3]),
Index([True, False, True]),
DataFrame({'x': ['a', 'b', 'c'], 'y': [1, 2, 3]}),
DataFrame(),
tm.makeMissingDataframe(),
tm.makeMixedDataFrame(),
tm.makeTimeDataFrame(),
tm.makeTimeSeries(),
tm.makeTimedeltaIndex(),
tm.makePeriodIndex(),
Series(tm.makePeriodIndex()),
Series(pd.date_range('20130101',
periods=3, tz='US/Eastern')),
MultiIndex.from_product(
[range(5),
['foo', 'bar', 'baz'],
pd.date_range('20130101', periods=2)]),
MultiIndex.from_product(
[pd.CategoricalIndex(list('aabc')),
range(3)])]:
self.check_equal(obj)
self.check_not_equal_with_index(obj)
def test_hash_pandas_object2(self):
for name, s in self.df.iteritems():
self.check_equal(s)
self.check_not_equal_with_index(s)
def test_hash_pandas_empty_object(self):
for obj in [Series([], dtype='float64'),
Series([], dtype='object'),
Index([])]:
self.check_equal(obj)
            # these are by definition the same with or without
            # the index, as the data is empty
def test_categorical_consistency(self):
# GH15143
        # Check that categoricals hash consistently with their values,
        # not their codes; this should work for categoricals of any dtype
for s1 in [Series(['a', 'b', 'c', 'd']),
Series([1000, 2000, 3000, 4000]),
Series(pd.date_range(0, periods=4))]:
s2 = s1.astype('category').cat.set_categories(s1)
s3 = s2.cat.set_categories(list(reversed(s1)))
for categorize in [True, False]:
# These should all hash identically
h1 = hash_pandas_object(s1, categorize=categorize)
h2 = hash_pandas_object(s2, categorize=categorize)
h3 = hash_pandas_object(s3, categorize=categorize)
tm.assert_series_equal(h1, h2)
tm.assert_series_equal(h1, h3)
def test_categorical_with_nan_consistency(self):
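        # NaN (code -1) and a shared category should hash identically
        # regardless of the full set of categories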
c = pd.Categorical.from_codes(
[-1, 0, 1, 2, 3, 4],
categories=pd.date_range('2012-01-01', periods=5, name='B'))
expected = hash_array(c, categorize=False)
c = pd.Categorical.from_codes(
[-1, 0],
categories=[pd.Timestamp('2012-01-01')])
result = hash_array(c, categorize=False)
assert result[0] in expected
assert result[1] in expected
def test_pandas_errors(self):
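        # scalars and Panels are not supported by hash_pandas_object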
for obj in [pd.Timestamp('20130101')]:
with pytest.raises(TypeError):
hash_pandas_object(obj)
with catch_warnings(record=True):
obj = tm.makePanel()
with pytest.raises(TypeError):
hash_pandas_object(obj)
def test_hash_keys(self):
        # different hash keys should produce different hashes for the
        # same data; this only matters for object dtypes
obj = Series(list('abc'))
a = hash_pandas_object(obj, hash_key='9876543210123456')
b = hash_pandas_object(obj, hash_key='9876543210123465')
assert (a != b).all()
def test_invalid_key(self):
# this only matters for object dtypes
def f():
hash_pandas_object(Series(list('abc')), hash_key='foo')
pytest.raises(ValueError, f)
    def test_already_encoded(self):
        # if the values are already encoded as bytes, this should work
obj = Series(list('abc')).str.encode('utf8')
self.check_equal(obj)
def test_alternate_encoding(self):
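        # hashing should also work with a non-utf8 encoding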
obj = Series(list('abc'))
self.check_equal(obj, encoding='ascii')
def test_same_len_hash_collisions(self):
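        # equal-length random strings should not collide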
for l in range(8):
length = 2**(l + 8) + 1
s = tm.rands_array(length, 2)
result = hash_array(s, 'utf8')
assert not result[0] == result[1]
for l in range(8):
length = 2**(l + 8)
s = tm.rands_array(length, 2)
result = hash_array(s, 'utf8')
assert not result[0] == result[1]
def test_hash_collisions(self):
# hash collisions are bad
# https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
L = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9', # noqa
'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe'] # noqa
# these should be different!
result1 = hash_array(np.asarray(L[0:1], dtype=object), 'utf8')
expected1 = np.array([14963968704024874985], dtype=np.uint64)
tm.assert_numpy_array_equal(result1, expected1)
result2 = hash_array(np.asarray(L[1:2], dtype=object), 'utf8')
expected2 = np.array([16428432627716348016], dtype=np.uint64)
tm.assert_numpy_array_equal(result2, expected2)
result = hash_array(np.asarray(L, dtype=object), 'utf8')
tm.assert_numpy_array_equal(
result, np.concatenate([expected1, expected2], axis=0))