laywerrobot/lib/python3.6/site-packages/sklearn/preprocessing/tests/test_data.py

1978 lines
74 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# Authors:
#
# Giorgio Patrini
#
# License: BSD 3 clause
from __future__ import division
import warnings
import numpy as np
import numpy.linalg as la
from scipy import sparse
from distutils.version import LooseVersion
from sklearn.utils import gen_batches
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import clean_warning_registry
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_array_less
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import assert_less_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import skip_if_32bit
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.preprocessing.data import _transform_selected
from sklearn.preprocessing.data import _handle_zeros_in_scale
from sklearn.preprocessing.data import Binarizer
from sklearn.preprocessing.data import KernelCenterer
from sklearn.preprocessing.data import Normalizer
from sklearn.preprocessing.data import normalize
from sklearn.preprocessing.data import OneHotEncoder
from sklearn.preprocessing.data import StandardScaler
from sklearn.preprocessing.data import scale
from sklearn.preprocessing.data import MinMaxScaler
from sklearn.preprocessing.data import minmax_scale
from sklearn.preprocessing.data import QuantileTransformer
from sklearn.preprocessing.data import quantile_transform
from sklearn.preprocessing.data import MaxAbsScaler
from sklearn.preprocessing.data import maxabs_scale
from sklearn.preprocessing.data import RobustScaler
from sklearn.preprocessing.data import robust_scale
from sklearn.preprocessing.data import add_dummy_feature
from sklearn.preprocessing.data import PolynomialFeatures
from sklearn.exceptions import DataConversionWarning
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVR
from sklearn import datasets
iris = datasets.load_iris()
# Make some data to be used many times
rng = np.random.RandomState(0)
n_features = 30
n_samples = 1000
offsets = rng.uniform(-1, 1, size=n_features)
scales = rng.uniform(1, 10, size=n_features)
X_2d = rng.randn(n_samples, n_features) * scales + offsets
X_1row = X_2d[0, :].reshape(1, n_features)
X_1col = X_2d[:, 0].reshape(n_samples, 1)
X_list_1row = X_1row.tolist()
X_list_1col = X_1col.tolist()
def toarray(a):
if hasattr(a, "toarray"):
a = a.toarray()
return a
def _check_dim_1axis(a):
if isinstance(a, list):
return np.array(a).shape[0]
return a.shape[0]
def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size,
n_samples_seen):
if batch_stop != n:
assert_equal((i + 1) * chunk_size, n_samples_seen)
else:
assert_equal(i * chunk_size + (batch_stop - batch_start),
n_samples_seen)
def test_polynomial_features():
# Test Polynomial Features
X1 = np.arange(6)[:, np.newaxis]
P1 = np.hstack([np.ones_like(X1),
X1, X1 ** 2, X1 ** 3])
deg1 = 3
X2 = np.arange(6).reshape((3, 2))
x1 = X2[:, :1]
x2 = X2[:, 1:]
P2 = np.hstack([x1 ** 0 * x2 ** 0,
x1 ** 1 * x2 ** 0,
x1 ** 0 * x2 ** 1,
x1 ** 2 * x2 ** 0,
x1 ** 1 * x2 ** 1,
x1 ** 0 * x2 ** 2])
deg2 = 2
for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]:
P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X)
assert_array_almost_equal(P_test, P)
P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X)
assert_array_almost_equal(P_test, P[:, 1:])
interact = PolynomialFeatures(2, interaction_only=True, include_bias=True)
X_poly = interact.fit_transform(X)
assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
assert_equal(interact.powers_.shape, (interact.n_output_features_,
interact.n_input_features_))
def test_polynomial_feature_names():
X = np.arange(30).reshape(10, 3)
poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
feature_names = poly.get_feature_names()
assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1',
'x0 x2', 'x1^2', 'x1 x2', 'x2^2'],
feature_names)
poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
feature_names = poly.get_feature_names(["a", "b", "c"])
assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2',
'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c',
'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c',
'b c^2', 'c^3'], feature_names)
# test some unicode
poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
feature_names = poly.get_feature_names(
[u"\u0001F40D", u"\u262E", u"\u05D0"])
assert_array_equal([u"1", u"\u0001F40D", u"\u262E", u"\u05D0"],
feature_names)
def test_standard_scaler_1d():
# Test scaling of dataset along single axis
for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
if isinstance(X, list):
X = np.array(X) # cast only after scaling done
if _check_dim_1axis(X) == 1:
assert_almost_equal(scaler.mean_, X.ravel())
assert_almost_equal(scaler.scale_, np.ones(n_features))
assert_array_almost_equal(X_scaled.mean(axis=0),
np.zeros_like(n_features))
assert_array_almost_equal(X_scaled.std(axis=0),
np.zeros_like(n_features))
else:
assert_almost_equal(scaler.mean_, X.mean())
assert_almost_equal(scaler.scale_, X.std())
assert_array_almost_equal(X_scaled.mean(axis=0),
np.zeros_like(n_features))
assert_array_almost_equal(X_scaled.mean(axis=0), .0)
assert_array_almost_equal(X_scaled.std(axis=0), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_array_almost_equal(X_scaled_back, X)
# Constant feature
X = np.ones(5).reshape(5, 1)
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_almost_equal(scaler.mean_, 1.)
assert_almost_equal(scaler.scale_, 1.)
assert_array_almost_equal(X_scaled.mean(axis=0), .0)
assert_array_almost_equal(X_scaled.std(axis=0), .0)
assert_equal(scaler.n_samples_seen_, X.shape[0])
def test_scale_1d():
# 1-d inputs
X_list = [1., 3., 5., 0.]
X_arr = np.array(X_list)
for X in [X_list, X_arr]:
X_scaled = scale(X)
assert_array_almost_equal(X_scaled.mean(), 0.0)
assert_array_almost_equal(X_scaled.std(), 1.0)
assert_array_equal(scale(X, with_mean=False, with_std=False), X)
@skip_if_32bit
def test_standard_scaler_numerical_stability():
# Test numerical stability of scaling
# np.log(1e-5) is taken because of its floating point representation
# was empirically found to cause numerical problems with np.mean & np.std.
x = np.zeros(8, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
if LooseVersion(np.__version__) >= LooseVersion('1.9'):
# This does not raise a warning as the number of samples is too low
# to trigger the problem in recent numpy
x_scaled = assert_no_warnings(scale, x)
assert_array_almost_equal(scale(x), np.zeros(8))
else:
w = "standard deviation of the data is probably very close to 0"
x_scaled = assert_warns_message(UserWarning, w, scale, x)
assert_array_almost_equal(x_scaled, np.zeros(8))
# with 2 more samples, the std computation run into numerical issues:
x = np.zeros(10, dtype=np.float64) + np.log(1e-5, dtype=np.float64)
w = "standard deviation of the data is probably very close to 0"
x_scaled = assert_warns_message(UserWarning, w, scale, x)
assert_array_almost_equal(x_scaled, np.zeros(10))
x = np.ones(10, dtype=np.float64) * 1e-100
x_small_scaled = assert_no_warnings(scale, x)
assert_array_almost_equal(x_small_scaled, np.zeros(10))
# Large values can cause (often recoverable) numerical stability issues:
x_big = np.ones(10, dtype=np.float64) * 1e100
w = "Dataset may contain too large values"
x_big_scaled = assert_warns_message(UserWarning, w, scale, x_big)
assert_array_almost_equal(x_big_scaled, np.zeros(10))
assert_array_almost_equal(x_big_scaled, x_small_scaled)
x_big_centered = assert_warns_message(UserWarning, w, scale, x_big,
with_std=False)
assert_array_almost_equal(x_big_centered, np.zeros(10))
assert_array_almost_equal(x_big_centered, x_small_scaled)
def test_scaler_2d_arrays():
# Test scaling of 2d array along first axis
rng = np.random.RandomState(0)
n_features = 5
n_samples = 4
X = rng.randn(n_samples, n_features)
X[:, 0] = 0.0 # first feature is always of zero
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
assert_equal(scaler.n_samples_seen_, n_samples)
assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
# Check that X has been copied
assert_true(X_scaled is not X)
# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_true(X_scaled_back is not X)
assert_true(X_scaled_back is not X_scaled)
assert_array_almost_equal(X_scaled_back, X)
X_scaled = scale(X, axis=1, with_std=False)
assert_false(np.any(np.isnan(X_scaled)))
assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
X_scaled = scale(X, axis=1, with_std=True)
assert_false(np.any(np.isnan(X_scaled)))
assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0])
# Check that the data hasn't been modified
assert_true(X_scaled is not X)
X_scaled = scaler.fit(X).transform(X, copy=False)
assert_false(np.any(np.isnan(X_scaled)))
assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
# Check that X has not been copied
assert_true(X_scaled is X)
X = rng.randn(4, 5)
X[:, 0] = 1.0 # first feature is a constant, non zero feature
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
# Check that X has not been copied
assert_true(X_scaled is not X)
def test_handle_zeros_in_scale():
s1 = np.array([0, 1, 2, 3])
s2 = _handle_zeros_in_scale(s1, copy=True)
assert_false(s1[0] == s2[0])
assert_array_equal(s1, np.array([0, 1, 2, 3]))
assert_array_equal(s2, np.array([1, 1, 2, 3]))
def test_minmax_scaler_partial_fit():
# Test if partial_fit run over many batches of size 1 and 50
# gives the same results as fit
X = X_2d
n = X.shape[0]
for chunk_size in [1, 2, 50, n, n + 42]:
# Test mean at the end of the process
scaler_batch = MinMaxScaler().fit(X)
scaler_incr = MinMaxScaler()
for batch in gen_batches(n_samples, chunk_size):
scaler_incr = scaler_incr.partial_fit(X[batch])
assert_array_almost_equal(scaler_batch.data_min_,
scaler_incr.data_min_)
assert_array_almost_equal(scaler_batch.data_max_,
scaler_incr.data_max_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_array_almost_equal(scaler_batch.data_range_,
scaler_incr.data_range_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
# Test std after 1 step
batch0 = slice(0, chunk_size)
scaler_batch = MinMaxScaler().fit(X[batch0])
scaler_incr = MinMaxScaler().partial_fit(X[batch0])
assert_array_almost_equal(scaler_batch.data_min_,
scaler_incr.data_min_)
assert_array_almost_equal(scaler_batch.data_max_,
scaler_incr.data_max_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_array_almost_equal(scaler_batch.data_range_,
scaler_incr.data_range_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
# Test std until the end of partial fits, and
scaler_batch = MinMaxScaler().fit(X)
scaler_incr = MinMaxScaler() # Clean estimator
for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
scaler_incr = scaler_incr.partial_fit(X[batch])
assert_correct_incr(i, batch_start=batch.start,
batch_stop=batch.stop, n=n,
chunk_size=chunk_size,
n_samples_seen=scaler_incr.n_samples_seen_)
def test_standard_scaler_partial_fit():
# Test if partial_fit run over many batches of size 1 and 50
# gives the same results as fit
X = X_2d
n = X.shape[0]
for chunk_size in [1, 2, 50, n, n + 42]:
# Test mean at the end of the process
scaler_batch = StandardScaler(with_std=False).fit(X)
scaler_incr = StandardScaler(with_std=False)
for batch in gen_batches(n_samples, chunk_size):
scaler_incr = scaler_incr.partial_fit(X[batch])
assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
assert_equal(scaler_batch.var_, scaler_incr.var_) # Nones
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
# Test std after 1 step
batch0 = slice(0, chunk_size)
scaler_incr = StandardScaler().partial_fit(X[batch0])
if chunk_size == 1:
assert_array_almost_equal(np.zeros(n_features, dtype=np.float64),
scaler_incr.var_)
assert_array_almost_equal(np.ones(n_features, dtype=np.float64),
scaler_incr.scale_)
else:
assert_array_almost_equal(np.var(X[batch0], axis=0),
scaler_incr.var_)
assert_array_almost_equal(np.std(X[batch0], axis=0),
scaler_incr.scale_) # no constants
# Test std until the end of partial fits, and
scaler_batch = StandardScaler().fit(X)
scaler_incr = StandardScaler() # Clean estimator
for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
scaler_incr = scaler_incr.partial_fit(X[batch])
assert_correct_incr(i, batch_start=batch.start,
batch_stop=batch.stop, n=n,
chunk_size=chunk_size,
n_samples_seen=scaler_incr.n_samples_seen_)
assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
def test_standard_scaler_partial_fit_numerical_stability():
# Test if the incremental computation introduces significative errors
# for large datasets with values of large magniture
rng = np.random.RandomState(0)
n_features = 2
n_samples = 100
offsets = rng.uniform(-1e15, 1e15, size=n_features)
scales = rng.uniform(1e3, 1e6, size=n_features)
X = rng.randn(n_samples, n_features) * scales + offsets
scaler_batch = StandardScaler().fit(X)
scaler_incr = StandardScaler()
for chunk in X:
scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
# Regardless of abs values, they must not be more diff 6 significant digits
tol = 10 ** (-6)
assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
# NOTE Be aware that for much larger offsets std is very unstable (last
# assert) while mean is OK.
# Sparse input
size = (100, 3)
scale = 1e20
X = rng.randint(0, 2, size).astype(np.float64) * scale
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
for X in [X_csr, X_csc]:
# with_mean=False is required with sparse input
scaler = StandardScaler(with_mean=False).fit(X)
scaler_incr = StandardScaler(with_mean=False)
for chunk in X:
# chunk = sparse.csr_matrix(data_chunks)
scaler_incr = scaler_incr.partial_fit(chunk)
# Regardless of magnitude, they must not differ more than of 6 digits
tol = 10 ** (-6)
assert_true(scaler.mean_ is not None)
assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol)
assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol)
def test_partial_fit_sparse_input():
# Check that sparsity is not destroyed
X = np.array([[1.], [0.], [0.], [5.]])
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
for X in [X_csr, X_csc]:
X_null = null_transform.partial_fit(X).transform(X)
assert_array_equal(X_null.data, X.data)
X_orig = null_transform.inverse_transform(X_null)
assert_array_equal(X_orig.data, X_null.data)
assert_array_equal(X_orig.data, X.data)
def test_standard_scaler_trasform_with_partial_fit():
# Check some postconditions after applying partial_fit and transform
X = X_2d[:100, :]
scaler_incr = StandardScaler()
for i, batch in enumerate(gen_batches(X.shape[0], 1)):
X_sofar = X[:(i + 1), :]
chunks_copy = X_sofar.copy()
scaled_batch = StandardScaler().fit_transform(X_sofar)
scaler_incr = scaler_incr.partial_fit(X[batch])
scaled_incr = scaler_incr.transform(X_sofar)
assert_array_almost_equal(scaled_batch, scaled_incr)
assert_array_almost_equal(X_sofar, chunks_copy) # No change
right_input = scaler_incr.inverse_transform(scaled_incr)
assert_array_almost_equal(X_sofar, right_input)
zero = np.zeros(X.shape[1])
epsilon = np.nextafter(0, 1)
assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal
assert_array_less(zero, scaler_incr.scale_ + epsilon)
# (i+1) because the Scaler has been already fitted
assert_equal((i + 1), scaler_incr.n_samples_seen_)
def test_min_max_scaler_iris():
X = iris.data
scaler = MinMaxScaler()
# default params
X_trans = scaler.fit_transform(X)
assert_array_almost_equal(X_trans.min(axis=0), 0)
assert_array_almost_equal(X_trans.max(axis=0), 1)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# not default params: min=1, max=2
scaler = MinMaxScaler(feature_range=(1, 2))
X_trans = scaler.fit_transform(X)
assert_array_almost_equal(X_trans.min(axis=0), 1)
assert_array_almost_equal(X_trans.max(axis=0), 2)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# min=-.5, max=.6
scaler = MinMaxScaler(feature_range=(-.5, .6))
X_trans = scaler.fit_transform(X)
assert_array_almost_equal(X_trans.min(axis=0), -.5)
assert_array_almost_equal(X_trans.max(axis=0), .6)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# raises on invalid range
scaler = MinMaxScaler(feature_range=(2, 1))
assert_raises(ValueError, scaler.fit, X)
def test_min_max_scaler_zero_variance_features():
# Check min max scaler on toy data with zero variance features
X = [[0., 1., +0.5],
[0., 1., -0.1],
[0., 1., +1.1]]
X_new = [[+0., 2., 0.5],
[-1., 1., 0.0],
[+0., 1., 1.5]]
# default params
scaler = MinMaxScaler()
X_trans = scaler.fit_transform(X)
X_expected_0_1 = [[0., 0., 0.5],
[0., 0., 0.0],
[0., 0., 1.0]]
assert_array_almost_equal(X_trans, X_expected_0_1)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
X_trans_new = scaler.transform(X_new)
X_expected_0_1_new = [[+0., 1., 0.500],
[-1., 0., 0.083],
[+0., 0., 1.333]]
assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)
# not default params
scaler = MinMaxScaler(feature_range=(1, 2))
X_trans = scaler.fit_transform(X)
X_expected_1_2 = [[1., 1., 1.5],
[1., 1., 1.0],
[1., 1., 2.0]]
assert_array_almost_equal(X_trans, X_expected_1_2)
# function interface
X_trans = minmax_scale(X)
assert_array_almost_equal(X_trans, X_expected_0_1)
X_trans = minmax_scale(X, feature_range=(1, 2))
assert_array_almost_equal(X_trans, X_expected_1_2)
def test_minmax_scale_axis1():
X = iris.data
X_trans = minmax_scale(X, axis=1)
assert_array_almost_equal(np.min(X_trans, axis=1), 0)
assert_array_almost_equal(np.max(X_trans, axis=1), 1)
def test_min_max_scaler_1d():
# Test scaling of dataset along single axis
for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
scaler = MinMaxScaler(copy=True)
X_scaled = scaler.fit(X).transform(X)
if isinstance(X, list):
X = np.array(X) # cast only after scaling done
if _check_dim_1axis(X) == 1:
assert_array_almost_equal(X_scaled.min(axis=0),
np.zeros(n_features))
assert_array_almost_equal(X_scaled.max(axis=0),
np.zeros(n_features))
else:
assert_array_almost_equal(X_scaled.min(axis=0), .0)
assert_array_almost_equal(X_scaled.max(axis=0), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_array_almost_equal(X_scaled_back, X)
# Constant feature
X = np.ones(5).reshape(5, 1)
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)
assert_greater_equal(X_scaled.min(), 0.)
assert_less_equal(X_scaled.max(), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
# Function interface
X_1d = X_1row.ravel()
min_ = X_1d.min()
max_ = X_1d.max()
assert_array_almost_equal((X_1d - min_) / (max_ - min_),
minmax_scale(X_1d, copy=True))
def test_scaler_without_centering():
rng = np.random.RandomState(42)
X = rng.randn(4, 5)
X[:, 0] = 0.0 # first feature is always of zero
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
assert_raises(ValueError, StandardScaler().fit, X_csr)
assert_raises(ValueError, StandardScaler().fit, X_csc)
null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
X_null = null_transform.fit_transform(X_csr)
assert_array_equal(X_null.data, X_csr.data)
X_orig = null_transform.inverse_transform(X_null)
assert_array_equal(X_orig.data, X_csr.data)
scaler = StandardScaler(with_mean=False).fit(X)
X_scaled = scaler.transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
assert_false(np.any(np.isnan(X_csr_scaled.data)))
scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
assert_false(np.any(np.isnan(X_csc_scaled.data)))
assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
assert_array_almost_equal(scaler.var_, scaler_csr.var_)
assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
assert_array_almost_equal(scaler.var_, scaler_csc.var_)
assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
assert_array_almost_equal(
X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
# Check that X has not been modified (copy)
assert_true(X_scaled is not X)
assert_true(X_csr_scaled is not X_csr)
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_true(X_scaled_back is not X)
assert_true(X_scaled_back is not X_scaled)
assert_array_almost_equal(X_scaled_back, X)
X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
assert_true(X_csr_scaled_back is not X_csr)
assert_true(X_csr_scaled_back is not X_csr_scaled)
assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
assert_true(X_csc_scaled_back is not X_csc)
assert_true(X_csc_scaled_back is not X_csc_scaled)
assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def test_scaler_int():
# test that scaler converts integer input to floating
# for both sparse and dense matrices
rng = np.random.RandomState(42)
X = rng.randint(20, size=(4, 5))
X[:, 0] = 0 # first feature is always of zero
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
clean_warning_registry()
with warnings.catch_warnings(record=True):
X_null = null_transform.fit_transform(X_csr)
assert_array_equal(X_null.data, X_csr.data)
X_orig = null_transform.inverse_transform(X_null)
assert_array_equal(X_orig.data, X_csr.data)
clean_warning_registry()
with warnings.catch_warnings(record=True):
scaler = StandardScaler(with_mean=False).fit(X)
X_scaled = scaler.transform(X, copy=True)
assert_false(np.any(np.isnan(X_scaled)))
clean_warning_registry()
with warnings.catch_warnings(record=True):
scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
assert_false(np.any(np.isnan(X_csr_scaled.data)))
clean_warning_registry()
with warnings.catch_warnings(record=True):
scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
assert_false(np.any(np.isnan(X_csc_scaled.data)))
assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
assert_array_almost_equal(scaler.var_, scaler_csr.var_)
assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
assert_array_almost_equal(scaler.var_, scaler_csc.var_)
assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
assert_array_almost_equal(
X_scaled.mean(axis=0),
[0., 1.109, 1.856, 21., 1.559], 2)
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(
X_csr_scaled.astype(np.float), 0)
assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
# Check that X has not been modified (copy)
assert_true(X_scaled is not X)
assert_true(X_csr_scaled is not X_csr)
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_true(X_scaled_back is not X)
assert_true(X_scaled_back is not X_scaled)
assert_array_almost_equal(X_scaled_back, X)
X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
assert_true(X_csr_scaled_back is not X_csr)
assert_true(X_csr_scaled_back is not X_csr_scaled)
assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
assert_true(X_csc_scaled_back is not X_csc)
assert_true(X_csc_scaled_back is not X_csc_scaled)
assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
def test_scaler_without_copy():
# Check that StandardScaler.fit does not change input
rng = np.random.RandomState(42)
X = rng.randn(4, 5)
X[:, 0] = 0.0 # first feature is always of zero
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
X_copy = X.copy()
StandardScaler(copy=False).fit(X)
assert_array_equal(X, X_copy)
X_csr_copy = X_csr.copy()
StandardScaler(with_mean=False, copy=False).fit(X_csr)
assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
X_csc_copy = X_csc.copy()
StandardScaler(with_mean=False, copy=False).fit(X_csc)
assert_array_equal(X_csc.toarray(), X_csc_copy.toarray())
def test_scale_sparse_with_mean_raise_exception():
rng = np.random.RandomState(42)
X = rng.randn(4, 5)
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
# check scaling and fit with direct calls on sparse data
assert_raises(ValueError, scale, X_csr, with_mean=True)
assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)
assert_raises(ValueError, scale, X_csc, with_mean=True)
assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csc)
# check transform and inverse_transform after a fit on a dense array
scaler = StandardScaler(with_mean=True).fit(X)
assert_raises(ValueError, scaler.transform, X_csr)
assert_raises(ValueError, scaler.transform, X_csc)
X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
X_transformed_csc = sparse.csc_matrix(scaler.transform(X))
assert_raises(ValueError, scaler.inverse_transform, X_transformed_csc)
def test_scale_input_finiteness_validation():
# Check if non finite inputs raise ValueError
X = [[np.nan, 5, 6, 7, 8]]
assert_raises_regex(ValueError,
"Input contains NaN, infinity or a value too large",
scale, X)
X = [[np.inf, 5, 6, 7, 8]]
assert_raises_regex(ValueError,
"Input contains NaN, infinity or a value too large",
scale, X)
def test_robust_scaler_2d_arrays():
# Test robust scaling of 2d array along first axis
rng = np.random.RandomState(0)
X = rng.randn(4, 5)
X[:, 0] = 0.0 # first feature is always of zero
scaler = RobustScaler()
X_scaled = scaler.fit(X).transform(X)
assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0])
assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
def test_robust_scaler_transform_one_row_csr():
# Check RobustScaler on transforming csr matrix with one row
rng = np.random.RandomState(0)
X = rng.randn(4, 5)
single_row = np.array([[0.1, 1., 2., 0., -1.]])
scaler = RobustScaler(with_centering=False)
scaler = scaler.fit(X)
row_trans = scaler.transform(sparse.csr_matrix(single_row))
row_expected = single_row / scaler.scale_
assert_array_almost_equal(row_trans.toarray(), row_expected)
row_scaled_back = scaler.inverse_transform(row_trans)
assert_array_almost_equal(single_row, row_scaled_back.toarray())
def test_robust_scaler_iris():
X = iris.data
scaler = RobustScaler()
X_trans = scaler.fit_transform(X)
assert_array_almost_equal(np.median(X_trans, axis=0), 0)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
q = np.percentile(X_trans, q=(25, 75), axis=0)
iqr = q[1] - q[0]
assert_array_almost_equal(iqr, 1)
def test_robust_scaler_iris_quantiles():
X = iris.data
scaler = RobustScaler(quantile_range=(10, 90))
X_trans = scaler.fit_transform(X)
assert_array_almost_equal(np.median(X_trans, axis=0), 0)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
q = np.percentile(X_trans, q=(10, 90), axis=0)
q_range = q[1] - q[0]
assert_array_almost_equal(q_range, 1)
def test_quantile_transform_iris():
X = iris.data
# uniform output distribution
transformer = QuantileTransformer(n_quantiles=30)
X_trans = transformer.fit_transform(X)
X_trans_inv = transformer.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# normal output distribution
transformer = QuantileTransformer(n_quantiles=30,
output_distribution='normal')
X_trans = transformer.fit_transform(X)
X_trans_inv = transformer.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# make sure it is possible to take the inverse of a sparse matrix
# which contain negative value; this is the case in the iris dataset
X_sparse = sparse.csc_matrix(X)
X_sparse_tran = transformer.fit_transform(X_sparse)
X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran)
assert_array_almost_equal(X_sparse.A, X_sparse_tran_inv.A)
def test_quantile_transform_check_error():
X = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
[2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
[0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]])
X = sparse.csc_matrix(X)
X_neg = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
[-2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
[0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]])
X_neg = sparse.csc_matrix(X_neg)
assert_raises_regex(ValueError, "Invalid value for 'n_quantiles': 0.",
QuantileTransformer(n_quantiles=0).fit, X)
assert_raises_regex(ValueError, "Invalid value for 'subsample': 0.",
QuantileTransformer(subsample=0).fit, X)
assert_raises_regex(ValueError, "The number of quantiles cannot be"
" greater than the number of samples used. Got"
" 1000 quantiles and 10 samples.",
QuantileTransformer(subsample=10).fit, X)
transformer = QuantileTransformer(n_quantiles=10)
assert_raises_regex(ValueError, "QuantileTransformer only accepts "
"non-negative sparse matrices.",
transformer.fit, X_neg)
transformer.fit(X)
assert_raises_regex(ValueError, "QuantileTransformer only accepts "
"non-negative sparse matrices.",
transformer.transform, X_neg)
X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
[0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]])
assert_raises_regex(ValueError, "X does not have the same number of "
"features as the previously fitted data. Got 2"
" instead of 3.",
transformer.transform, X_bad_feat)
assert_raises_regex(ValueError, "X does not have the same number of "
"features as the previously fitted data. Got 2"
" instead of 3.",
transformer.inverse_transform, X_bad_feat)
transformer = QuantileTransformer(n_quantiles=10,
output_distribution='rnd')
# check that an error is raised at fit time
assert_raises_regex(ValueError, "'output_distribution' has to be either"
" 'normal' or 'uniform'. Got 'rnd' instead.",
transformer.fit, X)
# check that an error is raised at transform time
transformer.output_distribution = 'uniform'
transformer.fit(X)
X_tran = transformer.transform(X)
transformer.output_distribution = 'rnd'
assert_raises_regex(ValueError, "'output_distribution' has to be either"
" 'normal' or 'uniform'. Got 'rnd' instead.",
transformer.transform, X)
# check that an error is raised at inverse_transform time
assert_raises_regex(ValueError, "'output_distribution' has to be either"
" 'normal' or 'uniform'. Got 'rnd' instead.",
transformer.inverse_transform, X_tran)
def test_quantile_transform_sparse_ignore_zeros():
X = np.array([[0, 1],
[0, 0],
[0, 2],
[0, 2],
[0, 1]])
X_sparse = sparse.csc_matrix(X)
transformer = QuantileTransformer(ignore_implicit_zeros=True,
n_quantiles=5)
# dense case -> warning raise
assert_warns_message(UserWarning, "'ignore_implicit_zeros' takes effect"
" only with sparse matrix. This parameter has no"
" effect.", transformer.fit, X)
X_expected = np.array([[0, 0],
[0, 0],
[0, 1],
[0, 1],
[0, 0]])
X_trans = transformer.fit_transform(X_sparse)
assert_almost_equal(X_expected, X_trans.A)
# consider the case where sparse entries are missing values and user-given
# zeros are to be considered
X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
X_trans = transformer.fit_transform(X_sparse)
X_expected = np.array([[0., 0.5],
[0., 0.],
[0., 1.],
[0., 1.],
[0., 0.5],
[0., 0.],
[0., 0.5],
[0., 1.],
[0., 0.]])
assert_almost_equal(X_expected, X_trans.A)
transformer = QuantileTransformer(ignore_implicit_zeros=True,
n_quantiles=5)
X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
X_trans = transformer.fit_transform(X_sparse)
X_expected = np.array([[0, 1],
[0, 0.375],
[0, 0.375],
[0, 0.375],
[0, 1],
[0, 0],
[0, 1]])
assert_almost_equal(X_expected, X_trans.A)
assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
# check in conjunction with subsampling
transformer = QuantileTransformer(ignore_implicit_zeros=True,
n_quantiles=5,
subsample=8,
random_state=0)
X_trans = transformer.fit_transform(X_sparse)
assert_almost_equal(X_expected, X_trans.A)
assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
def test_quantile_transform_dense_toy():
X = np.array([[0, 2, 2.6],
[25, 4, 4.1],
[50, 6, 2.3],
[75, 8, 9.5],
[100, 10, 0.1]])
transformer = QuantileTransformer(n_quantiles=5)
transformer.fit(X)
# using the a uniform output, each entry of X should be map between 0 and 1
# and equally spaced
X_trans = transformer.fit_transform(X)
X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T
assert_almost_equal(np.sort(X_trans, axis=0), X_expected)
X_test = np.array([
[-1, 1, 0],
[101, 11, 10],
])
X_expected = np.array([
[0, 0, 0],
[1, 1, 1],
])
assert_array_almost_equal(transformer.transform(X_test), X_expected)
X_trans_inv = transformer.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
def test_quantile_transform_subsampling():
# Test that subsampling the input yield to a consistent results We check
# that the computed quantiles are almost mapped to a [0, 1] vector where
# values are equally spaced. The infinite norm is checked to be smaller
# than a given threshold. This is repeated 5 times.
# dense support
n_samples = 1000000
n_quantiles = 1000
X = np.sort(np.random.sample((n_samples, 1)), axis=0)
ROUND = 5
inf_norm_arr = []
for random_state in range(ROUND):
transformer = QuantileTransformer(random_state=random_state,
n_quantiles=n_quantiles,
subsample=n_samples // 10)
transformer.fit(X)
diff = (np.linspace(0, 1, n_quantiles) -
np.ravel(transformer.quantiles_))
inf_norm = np.max(np.abs(diff))
assert_true(inf_norm < 1e-2)
inf_norm_arr.append(inf_norm)
# each random subsampling yield a unique approximation to the expected
# linspace CDF
assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr))
# sparse support
# TODO: rng should be seeded once we drop support for older versions of
# scipy (< 0.13) that don't support seeding.
X = sparse.rand(n_samples, 1, density=.99, format='csc')
inf_norm_arr = []
for random_state in range(ROUND):
transformer = QuantileTransformer(random_state=random_state,
n_quantiles=n_quantiles,
subsample=n_samples // 10)
transformer.fit(X)
diff = (np.linspace(0, 1, n_quantiles) -
np.ravel(transformer.quantiles_))
inf_norm = np.max(np.abs(diff))
assert_true(inf_norm < 1e-1)
inf_norm_arr.append(inf_norm)
# each random subsampling yield a unique approximation to the expected
# linspace CDF
assert_equal(len(np.unique(inf_norm_arr)), len(inf_norm_arr))
def test_quantile_transform_sparse_toy():
X = np.array([[0., 2., 0.],
[25., 4., 0.],
[50., 0., 2.6],
[0., 0., 4.1],
[0., 6., 0.],
[0., 8., 0.],
[75., 0., 2.3],
[0., 10., 0.],
[0., 0., 9.5],
[100., 0., 0.1]])
X = sparse.csc_matrix(X)
transformer = QuantileTransformer(n_quantiles=10)
transformer.fit(X)
X_trans = transformer.fit_transform(X)
assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.)
assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.)
X_trans_inv = transformer.inverse_transform(X_trans)
assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
transformer_dense = QuantileTransformer(n_quantiles=10).fit(
X.toarray())
X_trans = transformer_dense.transform(X)
assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.)
assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.)
X_trans_inv = transformer_dense.inverse_transform(X_trans)
assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
def test_quantile_transform_axis1():
X = np.array([[0, 25, 50, 75, 100],
[2, 4, 6, 8, 10],
[2.6, 4.1, 2.3, 9.5, 0.1]])
X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5)
X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5)
assert_array_almost_equal(X_trans_a0, X_trans_a1.T)
def test_quantile_transform_bounds():
# Lower and upper bounds are manually mapped. We checked that in the case
# of a constant feature and binary feature, the bounds are properly mapped.
X_dense = np.array([[0, 0],
[0, 0],
[1, 0]])
X_sparse = sparse.csc_matrix(X_dense)
# check sparse and dense are consistent
X_trans = QuantileTransformer(n_quantiles=3,
random_state=0).fit_transform(X_dense)
assert_array_almost_equal(X_trans, X_dense)
X_trans_sp = QuantileTransformer(n_quantiles=3,
random_state=0).fit_transform(X_sparse)
assert_array_almost_equal(X_trans_sp.A, X_dense)
assert_array_almost_equal(X_trans, X_trans_sp.A)
# check the consistency of the bounds by learning on 1 matrix
# and transforming another
X = np.array([[0, 1],
[0, 0.5],
[1, 0]])
X1 = np.array([[0, 0.1],
[0, 0.5],
[1, 0.1]])
transformer = QuantileTransformer(n_quantiles=3).fit(X)
X_trans = transformer.transform(X1)
assert_array_almost_equal(X_trans, X1)
# check that values outside of the range learned will be mapped properly.
X = np.random.random((1000, 1))
transformer = QuantileTransformer()
transformer.fit(X)
assert_equal(transformer.transform(-10), transformer.transform(np.min(X)))
assert_equal(transformer.transform(10), transformer.transform(np.max(X)))
assert_equal(transformer.inverse_transform(-10),
transformer.inverse_transform(
np.min(transformer.references_)))
assert_equal(transformer.inverse_transform(10),
transformer.inverse_transform(
np.max(transformer.references_)))
def test_quantile_transform_and_inverse():
# iris dataset
X = iris.data
transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
X_trans = transformer.fit_transform(X)
X_trans_inv = transformer.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
def test_robust_scaler_invalid_range():
for range_ in [
(-1, 90),
(-2, -3),
(10, 101),
(100.5, 101),
(90, 50),
]:
scaler = RobustScaler(quantile_range=range_)
assert_raises_regex(ValueError, 'Invalid quantile range: \(',
scaler.fit, iris.data)
def test_scale_function_without_centering():
rng = np.random.RandomState(42)
X = rng.randn(4, 5)
X[:, 0] = 0.0 # first feature is always of zero
X_csr = sparse.csr_matrix(X)
X_scaled = scale(X, with_mean=False)
assert_false(np.any(np.isnan(X_scaled)))
X_csr_scaled = scale(X_csr, with_mean=False)
assert_false(np.any(np.isnan(X_csr_scaled.data)))
# test csc has same outcome
X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())
# raises value error on axis != 0
assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)
assert_array_almost_equal(X_scaled.mean(axis=0),
[0., -0.01, 2.24, -0.35, -0.78], 2)
assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
# Check that X has not been copied
assert_true(X_scaled is not X)
X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
# null scale
X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
def test_robust_scale_axis1():
X = iris.data
X_trans = robust_scale(X, axis=1)
assert_array_almost_equal(np.median(X_trans, axis=1), 0)
q = np.percentile(X_trans, q=(25, 75), axis=1)
iqr = q[1] - q[0]
assert_array_almost_equal(iqr, 1)
def test_robust_scaler_zero_variance_features():
# Check RobustScaler on toy data with zero variance features
X = [[0., 1., +0.5],
[0., 1., -0.1],
[0., 1., +1.1]]
scaler = RobustScaler()
X_trans = scaler.fit_transform(X)
# NOTE: for such a small sample size, what we expect in the third column
# depends HEAVILY on the method used to calculate quantiles. The values
# here were calculated to fit the quantiles produces by np.percentile
# using numpy 1.9 Calculating quantiles with
# scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
# would yield very different results!
X_expected = [[0., 0., +0.0],
[0., 0., -1.0],
[0., 0., +1.0]]
assert_array_almost_equal(X_trans, X_expected)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# make sure new data gets transformed correctly
X_new = [[+0., 2., 0.5],
[-1., 1., 0.0],
[+0., 1., 1.5]]
X_trans_new = scaler.transform(X_new)
X_expected_new = [[+0., 1., +0.],
[-1., 0., -0.83333],
[+0., 0., +1.66667]]
assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3)
def test_maxabs_scaler_zero_variance_features():
# Check MaxAbsScaler on toy data with zero variance features
X = [[0., 1., +0.5],
[0., 1., -0.3],
[0., 1., +1.5],
[0., 0., +0.0]]
scaler = MaxAbsScaler()
X_trans = scaler.fit_transform(X)
X_expected = [[0., 1., 1.0 / 3.0],
[0., 1., -0.2],
[0., 1., 1.0],
[0., 0., 0.0]]
assert_array_almost_equal(X_trans, X_expected)
X_trans_inv = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X, X_trans_inv)
# make sure new data gets transformed correctly
X_new = [[+0., 2., 0.5],
[-1., 1., 0.0],
[+0., 1., 1.5]]
X_trans_new = scaler.transform(X_new)
X_expected_new = [[+0., 2.0, 1.0 / 3.0],
[-1., 1.0, 0.0],
[+0., 1.0, 1.0]]
assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2)
# function interface
X_trans = maxabs_scale(X)
assert_array_almost_equal(X_trans, X_expected)
# sparse data
X_csr = sparse.csr_matrix(X)
X_csc = sparse.csc_matrix(X)
X_trans_csr = scaler.fit_transform(X_csr)
X_trans_csc = scaler.fit_transform(X_csc)
X_expected = [[0., 1., 1.0 / 3.0],
[0., 1., -0.2],
[0., 1., 1.0],
[0., 0., 0.0]]
assert_array_almost_equal(X_trans_csr.A, X_expected)
assert_array_almost_equal(X_trans_csc.A, X_expected)
X_trans_csr_inv = scaler.inverse_transform(X_trans_csr)
X_trans_csc_inv = scaler.inverse_transform(X_trans_csc)
assert_array_almost_equal(X, X_trans_csr_inv.A)
assert_array_almost_equal(X, X_trans_csc_inv.A)
def test_maxabs_scaler_large_negative_value():
# Check MaxAbsScaler on toy data with a large negative value
X = [[0., 1., +0.5, -1.0],
[0., 1., -0.3, -0.5],
[0., 1., -100.0, 0.0],
[0., 0., +0.0, -2.0]]
scaler = MaxAbsScaler()
X_trans = scaler.fit_transform(X)
X_expected = [[0., 1., 0.005, -0.5],
[0., 1., -0.003, -0.25],
[0., 1., -1.0, 0.0],
[0., 0., 0.0, -1.0]]
assert_array_almost_equal(X_trans, X_expected)
def test_maxabs_scaler_transform_one_row_csr():
# Check MaxAbsScaler on transforming csr matrix with one row
X = sparse.csr_matrix([[0.5, 1., 1.]])
scaler = MaxAbsScaler()
scaler = scaler.fit(X)
X_trans = scaler.transform(X)
X_expected = sparse.csr_matrix([[1., 1., 1.]])
assert_array_almost_equal(X_trans.toarray(), X_expected.toarray())
X_scaled_back = scaler.inverse_transform(X_trans)
assert_array_almost_equal(X.toarray(), X_scaled_back.toarray())
def test_warning_scaling_integers():
# Check warning when scaling integer data
X = np.array([[1, 2, 0],
[0, 0, 0]], dtype=np.uint8)
w = "Data with input dtype uint8 was converted to float64"
clean_warning_registry()
assert_warns_message(DataConversionWarning, w, scale, X)
assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
def test_maxabs_scaler_1d():
# Test scaling of dataset along single axis
for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
scaler = MaxAbsScaler(copy=True)
X_scaled = scaler.fit(X).transform(X)
if isinstance(X, list):
X = np.array(X) # cast only after scaling done
if _check_dim_1axis(X) == 1:
assert_array_almost_equal(np.abs(X_scaled.max(axis=0)),
np.ones(n_features))
else:
assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
# check inverse transform
X_scaled_back = scaler.inverse_transform(X_scaled)
assert_array_almost_equal(X_scaled_back, X)
# Constant feature
X = np.ones(5).reshape(5, 1)
scaler = MaxAbsScaler()
X_scaled = scaler.fit(X).transform(X)
assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.)
assert_equal(scaler.n_samples_seen_, X.shape[0])
# function interface
X_1d = X_1row.ravel()
max_abs = np.abs(X_1d).max()
assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
def test_maxabs_scaler_partial_fit():
# Test if partial_fit run over many batches of size 1 and 50
# gives the same results as fit
X = X_2d[:100, :]
n = X.shape[0]
for chunk_size in [1, 2, 50, n, n + 42]:
# Test mean at the end of the process
scaler_batch = MaxAbsScaler().fit(X)
scaler_incr = MaxAbsScaler()
scaler_incr_csr = MaxAbsScaler()
scaler_incr_csc = MaxAbsScaler()
for batch in gen_batches(n, chunk_size):
scaler_incr = scaler_incr.partial_fit(X[batch])
X_csr = sparse.csr_matrix(X[batch])
scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
X_csc = sparse.csc_matrix(X[batch])
scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)
assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
assert_array_almost_equal(scaler_batch.max_abs_,
scaler_incr_csr.max_abs_)
assert_array_almost_equal(scaler_batch.max_abs_,
scaler_incr_csc.max_abs_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_equal(scaler_batch.n_samples_seen_,
scaler_incr_csr.n_samples_seen_)
assert_equal(scaler_batch.n_samples_seen_,
scaler_incr_csc.n_samples_seen_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
assert_array_almost_equal(scaler_batch.transform(X),
scaler_incr.transform(X))
# Test std after 1 step
batch0 = slice(0, chunk_size)
scaler_batch = MaxAbsScaler().fit(X[batch0])
scaler_incr = MaxAbsScaler().partial_fit(X[batch0])
assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_)
assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
assert_array_almost_equal(scaler_batch.transform(X),
scaler_incr.transform(X))
# Test std until the end of partial fits, and
scaler_batch = MaxAbsScaler().fit(X)
scaler_incr = MaxAbsScaler() # Clean estimator
for i, batch in enumerate(gen_batches(n, chunk_size)):
scaler_incr = scaler_incr.partial_fit(X[batch])
assert_correct_incr(i, batch_start=batch.start,
batch_stop=batch.stop, n=n,
chunk_size=chunk_size,
n_samples_seen=scaler_incr.n_samples_seen_)
def test_normalizer_l1():
rng = np.random.RandomState(0)
X_dense = rng.randn(4, 5)
X_sparse_unpruned = sparse.csr_matrix(X_dense)
# set the row number 3 to zero
X_dense[3, :] = 0.0
# set the row number 3 to zero without pruning (can happen in real life)
indptr_3 = X_sparse_unpruned.indptr[3]
indptr_4 = X_sparse_unpruned.indptr[4]
X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
# build the pruned variant using the regular constructor
X_sparse_pruned = sparse.csr_matrix(X_dense)
# check inputs that support the no-copy optim
for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
normalizer = Normalizer(norm='l1', copy=True)
X_norm = normalizer.transform(X)
assert_true(X_norm is not X)
X_norm1 = toarray(X_norm)
normalizer = Normalizer(norm='l1', copy=False)
X_norm = normalizer.transform(X)
assert_true(X_norm is X)
X_norm2 = toarray(X_norm)
for X_norm in (X_norm1, X_norm2):
row_sums = np.abs(X_norm).sum(axis=1)
for i in range(3):
assert_almost_equal(row_sums[i], 1.0)
assert_almost_equal(row_sums[3], 0.0)
# check input for which copy=False won't prevent a copy
for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
X = init(X_dense)
X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)
assert_true(X_norm is not X)
assert_true(isinstance(X_norm, sparse.csr_matrix))
X_norm = toarray(X_norm)
for i in range(3):
assert_almost_equal(row_sums[i], 1.0)
assert_almost_equal(la.norm(X_norm[3]), 0.0)
def test_normalizer_l2():
rng = np.random.RandomState(0)
X_dense = rng.randn(4, 5)
X_sparse_unpruned = sparse.csr_matrix(X_dense)
# set the row number 3 to zero
X_dense[3, :] = 0.0
# set the row number 3 to zero without pruning (can happen in real life)
indptr_3 = X_sparse_unpruned.indptr[3]
indptr_4 = X_sparse_unpruned.indptr[4]
X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
# build the pruned variant using the regular constructor
X_sparse_pruned = sparse.csr_matrix(X_dense)
# check inputs that support the no-copy optim
for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
normalizer = Normalizer(norm='l2', copy=True)
X_norm1 = normalizer.transform(X)
assert_true(X_norm1 is not X)
X_norm1 = toarray(X_norm1)
normalizer = Normalizer(norm='l2', copy=False)
X_norm2 = normalizer.transform(X)
assert_true(X_norm2 is X)
X_norm2 = toarray(X_norm2)
for X_norm in (X_norm1, X_norm2):
for i in range(3):
assert_almost_equal(la.norm(X_norm[i]), 1.0)
assert_almost_equal(la.norm(X_norm[3]), 0.0)
# check input for which copy=False won't prevent a copy
for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
X = init(X_dense)
X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)
assert_true(X_norm is not X)
assert_true(isinstance(X_norm, sparse.csr_matrix))
X_norm = toarray(X_norm)
for i in range(3):
assert_almost_equal(la.norm(X_norm[i]), 1.0)
assert_almost_equal(la.norm(X_norm[3]), 0.0)
def test_normalizer_max():
rng = np.random.RandomState(0)
X_dense = rng.randn(4, 5)
X_sparse_unpruned = sparse.csr_matrix(X_dense)
# set the row number 3 to zero
X_dense[3, :] = 0.0
# set the row number 3 to zero without pruning (can happen in real life)
indptr_3 = X_sparse_unpruned.indptr[3]
indptr_4 = X_sparse_unpruned.indptr[4]
X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
# build the pruned variant using the regular constructor
X_sparse_pruned = sparse.csr_matrix(X_dense)
# check inputs that support the no-copy optim
for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
normalizer = Normalizer(norm='max', copy=True)
X_norm1 = normalizer.transform(X)
assert_true(X_norm1 is not X)
X_norm1 = toarray(X_norm1)
normalizer = Normalizer(norm='max', copy=False)
X_norm2 = normalizer.transform(X)
assert_true(X_norm2 is X)
X_norm2 = toarray(X_norm2)
for X_norm in (X_norm1, X_norm2):
row_maxs = X_norm.max(axis=1)
for i in range(3):
assert_almost_equal(row_maxs[i], 1.0)
assert_almost_equal(row_maxs[3], 0.0)
# check input for which copy=False won't prevent a copy
for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
X = init(X_dense)
X_norm = normalizer = Normalizer(norm='l2', copy=False).transform(X)
assert_true(X_norm is not X)
assert_true(isinstance(X_norm, sparse.csr_matrix))
X_norm = toarray(X_norm)
for i in range(3):
assert_almost_equal(row_maxs[i], 1.0)
assert_almost_equal(la.norm(X_norm[3]), 0.0)
def test_normalize():
# Test normalize function
# Only tests functionality not used by the tests for Normalizer.
X = np.random.RandomState(37).randn(3, 2)
assert_array_equal(normalize(X, copy=False),
normalize(X.T, axis=0, copy=False).T)
assert_raises(ValueError, normalize, [[0]], axis=2)
assert_raises(ValueError, normalize, [[0]], norm='l3')
rs = np.random.RandomState(0)
X_dense = rs.randn(10, 5)
X_sparse = sparse.csr_matrix(X_dense)
ones = np.ones((10))
for X in (X_dense, X_sparse):
for dtype in (np.float32, np.float64):
for norm in ('l1', 'l2'):
X = X.astype(dtype)
X_norm = normalize(X, norm=norm)
assert_equal(X_norm.dtype, dtype)
X_norm = toarray(X_norm)
if norm == 'l1':
row_sums = np.abs(X_norm).sum(axis=1)
else:
X_norm_squared = X_norm**2
row_sums = X_norm_squared.sum(axis=1)
assert_array_almost_equal(row_sums, ones)
# Test return_norm
X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])
for norm in ('l1', 'l2', 'max'):
_, norms = normalize(X_dense, norm=norm, return_norm=True)
if norm == 'l1':
assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0]))
elif norm == 'l2':
assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127]))
else:
assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
X_sparse = sparse.csr_matrix(X_dense)
for norm in ('l1', 'l2'):
assert_raises(NotImplementedError, normalize, X_sparse,
norm=norm, return_norm=True)
_, norms = normalize(X_sparse, norm='max', return_norm=True)
assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
def test_binarizer():
X_ = np.array([[1, 0, 5], [2, 3, -1]])
for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
X = init(X_.copy())
binarizer = Binarizer(threshold=2.0, copy=True)
X_bin = toarray(binarizer.transform(X))
assert_equal(np.sum(X_bin == 0), 4)
assert_equal(np.sum(X_bin == 1), 2)
X_bin = binarizer.transform(X)
assert_equal(sparse.issparse(X), sparse.issparse(X_bin))
binarizer = Binarizer(copy=True).fit(X)
X_bin = toarray(binarizer.transform(X))
assert_true(X_bin is not X)
assert_equal(np.sum(X_bin == 0), 2)
assert_equal(np.sum(X_bin == 1), 4)
binarizer = Binarizer(copy=True)
X_bin = binarizer.transform(X)
assert_true(X_bin is not X)
X_bin = toarray(X_bin)
assert_equal(np.sum(X_bin == 0), 2)
assert_equal(np.sum(X_bin == 1), 4)
binarizer = Binarizer(copy=False)
X_bin = binarizer.transform(X)
if init is not list:
assert_true(X_bin is X)
binarizer = Binarizer(copy=False)
X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64)
X_bin = binarizer.transform(X_float)
if init is not list:
assert_true(X_bin is X_float)
X_bin = toarray(X_bin)
assert_equal(np.sum(X_bin == 0), 2)
assert_equal(np.sum(X_bin == 1), 4)
binarizer = Binarizer(threshold=-0.5, copy=True)
for init in (np.array, list):
X = init(X_.copy())
X_bin = toarray(binarizer.transform(X))
assert_equal(np.sum(X_bin == 0), 1)
assert_equal(np.sum(X_bin == 1), 5)
X_bin = binarizer.transform(X)
# Cannot use threshold < 0 for sparse
assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
def test_center_kernel():
# Test that KernelCenterer is equivalent to StandardScaler
# in feature space
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
scaler = StandardScaler(with_std=False)
scaler.fit(X_fit)
X_fit_centered = scaler.transform(X_fit)
K_fit = np.dot(X_fit, X_fit.T)
# center fit time matrix
centerer = KernelCenterer()
K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
K_fit_centered2 = centerer.fit_transform(K_fit)
assert_array_almost_equal(K_fit_centered, K_fit_centered2)
# center predict time matrix
X_pred = rng.random_sample((2, 4))
K_pred = np.dot(X_pred, X_fit.T)
X_pred_centered = scaler.transform(X_pred)
K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
K_pred_centered2 = centerer.transform(K_pred)
assert_array_almost_equal(K_pred_centered, K_pred_centered2)
def test_cv_pipeline_precomputed():
# Cross-validate a regression on four coplanar points with the same
# value. Use precomputed kernel to ensure Pipeline with KernelCenterer
# is treated as a _pairwise operation.
X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
y_true = np.ones((4,))
K = X.dot(X.T)
kcent = KernelCenterer()
pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())])
# did the pipeline set the _pairwise attribute?
assert_true(pipeline._pairwise)
# test cross-validation, score should be almost perfect
# NB: this test is pretty vacuous -- it's mainly to test integration
# of Pipeline and KernelCenterer
y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
assert_array_almost_equal(y_true, y_pred)
def test_fit_transform():
rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
for obj in ((StandardScaler(), Normalizer(), Binarizer())):
X_transformed = obj.fit(X).transform(X)
X_transformed2 = obj.fit_transform(X)
assert_array_equal(X_transformed, X_transformed2)
def test_add_dummy_feature():
X = [[1, 0], [0, 1], [0, 1]]
X = add_dummy_feature(X)
assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_add_dummy_feature_coo():
X = sparse.coo_matrix([[1, 0], [0, 1], [0, 1]])
X = add_dummy_feature(X)
assert_true(sparse.isspmatrix_coo(X), X)
assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_add_dummy_feature_csc():
X = sparse.csc_matrix([[1, 0], [0, 1], [0, 1]])
X = add_dummy_feature(X)
assert_true(sparse.isspmatrix_csc(X), X)
assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_add_dummy_feature_csr():
X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]])
X = add_dummy_feature(X)
assert_true(sparse.isspmatrix_csr(X), X)
assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
def test_one_hot_encoder_sparse():
# Test OneHotEncoder's fit and transform.
X = [[3, 2, 1], [0, 1, 1]]
enc = OneHotEncoder()
# discover max values automatically
X_trans = enc.fit_transform(X).toarray()
assert_equal(X_trans.shape, (2, 5))
assert_array_equal(enc.active_features_,
np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
# check outcome
assert_array_equal(X_trans,
[[0., 1., 0., 1., 1.],
[1., 0., 1., 0., 1.]])
# max value given as 3
enc = OneHotEncoder(n_values=4)
X_trans = enc.fit_transform(X)
assert_equal(X_trans.shape, (2, 4 * 3))
assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])
# max value given per feature
enc = OneHotEncoder(n_values=[3, 2, 2])
X = [[1, 0, 1], [0, 1, 1]]
X_trans = enc.fit_transform(X)
assert_equal(X_trans.shape, (2, 3 + 2 + 2))
assert_array_equal(enc.n_values_, [3, 2, 2])
# check that testing with larger feature works:
X = np.array([[2, 0, 1], [0, 1, 1]])
enc.transform(X)
# test that an error is raised when out of bounds:
X_too_large = [[0, 2, 1], [0, 1, 1]]
assert_raises(ValueError, enc.transform, X_too_large)
error_msg = "unknown categorical feature present \[2\] during transform."
assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)
# test that error is raised when wrong number of features
assert_raises(ValueError, enc.transform, X[:, :-1])
# test that error is raised when wrong number of features in fit
# with prespecified n_values
assert_raises(ValueError, enc.fit, X[:, :-1])
# test exception on wrong init param
assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)
enc = OneHotEncoder()
# test negative input to fit
assert_raises(ValueError, enc.fit, [[0], [-1]])
# test negative input to transform
enc.fit([[0], [1]])
assert_raises(ValueError, enc.transform, [[0], [-1]])
def test_one_hot_encoder_dense():
# check for sparse=False
X = [[3, 2, 1], [0, 1, 1]]
enc = OneHotEncoder(sparse=False)
# discover max values automatically
X_trans = enc.fit_transform(X)
assert_equal(X_trans.shape, (2, 5))
assert_array_equal(enc.active_features_,
np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])
# check outcome
assert_array_equal(X_trans,
np.array([[0., 1., 0., 1., 1.],
[1., 0., 1., 0., 1.]]))
def _check_transform_selected(X, X_expected, sel):
for M in (X, sparse.csr_matrix(X)):
Xtr = _transform_selected(M, Binarizer().transform, sel)
assert_array_equal(toarray(Xtr), X_expected)
def test_transform_selected():
X = [[3, 2, 1], [0, 1, 1]]
X_expected = [[1, 2, 1], [0, 1, 1]]
_check_transform_selected(X, X_expected, [0])
_check_transform_selected(X, X_expected, [True, False, False])
X_expected = [[1, 1, 1], [0, 1, 1]]
_check_transform_selected(X, X_expected, [0, 1, 2])
_check_transform_selected(X, X_expected, [True, True, True])
_check_transform_selected(X, X_expected, "all")
_check_transform_selected(X, X, [])
_check_transform_selected(X, X, [False, False, False])
def test_transform_selected_copy_arg():
# transformer that alters X
def _mutating_transformer(X):
X[0, 0] = X[0, 0] + 1
return X
original_X = np.asarray([[1, 2], [3, 4]])
expected_Xtr = [[2, 2], [3, 4]]
X = original_X.copy()
Xtr = _transform_selected(X, _mutating_transformer, copy=True,
selected='all')
assert_array_equal(toarray(X), toarray(original_X))
assert_array_equal(toarray(Xtr), expected_Xtr)
def _run_one_hot(X, X2, cat):
enc = OneHotEncoder(categorical_features=cat)
Xtr = enc.fit_transform(X)
X2tr = enc.transform(X2)
return Xtr, X2tr
def _check_one_hot(X, X2, cat, n_features):
ind = np.where(cat)[0]
# With mask
A, B = _run_one_hot(X, X2, cat)
# With indices
C, D = _run_one_hot(X, X2, ind)
# Check shape
assert_equal(A.shape, (2, n_features))
assert_equal(B.shape, (1, n_features))
assert_equal(C.shape, (2, n_features))
assert_equal(D.shape, (1, n_features))
# Check that mask and indices give the same results
assert_array_equal(toarray(A), toarray(C))
assert_array_equal(toarray(B), toarray(D))
def test_one_hot_encoder_categorical_features():
X = np.array([[3, 2, 1], [0, 1, 1]])
X2 = np.array([[1, 1, 1]])
cat = [True, False, False]
_check_one_hot(X, X2, cat, 4)
# Edge case: all non-categorical
cat = [False, False, False]
_check_one_hot(X, X2, cat, 3)
# Edge case: all categorical
cat = [True, True, True]
_check_one_hot(X, X2, cat, 5)
def test_one_hot_encoder_unknown_transform():
X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
y = np.array([[4, 1, 1]])
# Test that one hot encoder raises error for unknown features
# present during transform.
oh = OneHotEncoder(handle_unknown='error')
oh.fit(X)
assert_raises(ValueError, oh.transform, y)
# Test the ignore option, ignores unknown features.
oh = OneHotEncoder(handle_unknown='ignore')
oh.fit(X)
assert_array_equal(
oh.transform(y).toarray(),
np.array([[0., 0., 0., 0., 1., 0., 0.]]))
# Raise error if handle_unknown is neither ignore or error.
oh = OneHotEncoder(handle_unknown='42')
oh.fit(X)
assert_raises(ValueError, oh.transform, y)
def test_fit_cold_start():
X = iris.data
X_2d = X[:, :2]
# Scalers that have a partial_fit method
scalers = [StandardScaler(with_mean=False, with_std=False),
MinMaxScaler(),
MaxAbsScaler()]
for scaler in scalers:
scaler.fit_transform(X)
# with a different shape, this may break the scaler unless the internal
# state is reset
scaler.fit_transform(X_2d)
def test_quantile_transform_valid_axis():
X = np.array([[0, 25, 50, 75, 100],
[2, 4, 6, 8, 10],
[2.6, 4.1, 2.3, 9.5, 0.1]])
assert_raises_regex(ValueError, "axis should be either equal to 0 or 1"
". Got axis=2", quantile_transform, X.T, axis=2)