1954 lines
77 KiB
Python
1954 lines
77 KiB
Python
"""Forest of trees-based ensemble methods
|
|
|
|
Those methods include random forests and extremely randomized trees.
|
|
|
|
The module structure is the following:
|
|
|
|
- The ``BaseForest`` base class implements a common ``fit`` method for all
|
|
the estimators in the module. The ``fit`` method of the base ``Forest``
|
|
class calls the ``fit`` method of each sub-estimator on random samples
|
|
(with replacement, a.k.a. bootstrap) of the training set.
|
|
|
|
The init of the sub-estimator is further delegated to the
|
|
``BaseEnsemble`` constructor.
|
|
|
|
- The ``ForestClassifier`` and ``ForestRegressor`` base classes further
|
|
implement the prediction logic by computing an average of the predicted
|
|
outcomes of the sub-estimators.
|
|
|
|
- The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived
|
|
classes provide the user with concrete implementations of
|
|
the forest ensemble method using classical, deterministic
|
|
``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
- The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
|
|
classes provide the user with concrete implementations of the
|
|
forest ensemble method using the extremely randomized trees
|
|
``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as
|
|
sub-estimator implementations.
|
|
|
|
Single and multi-output problems are both handled.
|
|
|
|
"""
|
|
|
|
# Authors: Gilles Louppe <g.louppe@gmail.com>
|
|
# Brian Holt <bdholt1@gmail.com>
|
|
# Joly Arnaud <arnaud.v.joly@gmail.com>
|
|
# Fares Hedayati <fares.hedayati@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from __future__ import division
|
|
|
|
import warnings
|
|
from warnings import warn
|
|
import threading
|
|
|
|
from abc import ABCMeta, abstractmethod
|
|
import numpy as np
|
|
from scipy.sparse import issparse
|
|
from scipy.sparse import hstack as sparse_hstack
|
|
|
|
|
|
from ..base import ClassifierMixin, RegressorMixin
|
|
from ..externals.joblib import Parallel, delayed
|
|
from ..externals import six
|
|
from ..metrics import r2_score
|
|
from ..preprocessing import OneHotEncoder
|
|
from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor,
|
|
ExtraTreeClassifier, ExtraTreeRegressor)
|
|
from ..tree._tree import DTYPE, DOUBLE
|
|
from ..utils import check_random_state, check_array, compute_sample_weight
|
|
from ..exceptions import DataConversionWarning, NotFittedError
|
|
from .base import BaseEnsemble, _partition_estimators
|
|
from ..utils.fixes import parallel_helper
|
|
from ..utils.multiclass import check_classification_targets
|
|
from ..utils.validation import check_is_fitted
|
|
|
|
# Public estimators exported by this module.
__all__ = [
    "RandomForestClassifier",
    "RandomForestRegressor",
    "ExtraTreesClassifier",
    "ExtraTreesRegressor",
    "RandomTreesEmbedding",
]

# Largest value representable by a signed 32-bit integer; used as the
# exclusive upper bound when drawing integer seeds from a RandomState.
MAX_INT = np.iinfo(np.int32).max
|
|
|
|
|
|
def _generate_sample_indices(random_state, n_samples):
    """Draw the bootstrap row indices for one tree.

    ``random_state`` is turned into a generator with ``check_random_state``
    and ``n_samples`` integers are drawn from ``[0, n_samples)``, so a given
    index may appear several times or not at all.
    """
    rng = check_random_state(random_state)
    return rng.randint(0, n_samples, n_samples)
|
|
|
|
|
|
def _generate_unsampled_indices(random_state, n_samples):
    """Return the row indices left out of the bootstrap draw.

    The draw is regenerated with ``_generate_sample_indices`` from the same
    ``random_state``; every index in ``[0, n_samples)`` that was never drawn
    is returned, in increasing order.
    """
    drawn = _generate_sample_indices(random_state, n_samples)
    times_drawn = np.bincount(drawn, minlength=n_samples)
    never_drawn = times_drawn == 0
    return np.arange(n_samples)[never_drawn]
|
|
|
|
|
|
def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel.

    Parameters
    ----------
    tree : estimator
        The (unfitted) tree to fit. Its ``random_state`` attribute seeds the
        bootstrap draw.

    forest : estimator
        The owning forest; only its ``bootstrap`` attribute is read.

    X, y : arrays
        Training data, passed to ``tree.fit`` with ``check_input=False``.

    sample_weight : array or None
        Per-sample weights. Never modified in place.

    tree_idx, n_trees : int
        Only used for the progress message printed when ``verbose > 1``.

    verbose : int, optional (default=0)

    class_weight : string or None, optional (default=None)
        When ``bootstrap`` is enabled, ``'subsample'`` and
        ``'balanced_subsample'`` additionally rescale the weights with
        ``compute_sample_weight`` evaluated on the bootstrap indices.

    Returns
    -------
    tree : estimator
        The same ``tree`` object, after ``fit`` has been called on it.
    """
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            # Work on a float64 copy: the weights are scaled in place below,
            # and multiplying an integer array in place by floating point
            # class weights fails NumPy's casting check.
            curr_sample_weight = np.array(sample_weight, dtype=np.float64)

        # Bootstrapping is expressed through the weights: each row is
        # weighted by the number of times it was drawn (0 when left out).
        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            # NOTE(review): the preset check in
            # ForestClassifier._validate_y_class_weight does not list
            # 'subsample'; confirm whether this branch is still reachable.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
|
|
|
|
|
|
class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
    """Base class for forests of trees.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator,
                 n_estimators=10,
                 estimator_params=tuple(),
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):
        super(BaseForest, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            estimator_params=estimator_params)

        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight

    def apply(self, X):
        """Apply trees in the forest to X, return leaf indices.

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        X_leaves : array_like, shape = [n_samples, n_estimators]
            For each datapoint x in X and for each tree in the forest,
            return the index of the leaf x ends up in.
        """
        X = self._validate_X_predict(X)
        results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                           backend="threading")(
            delayed(parallel_helper)(tree, 'apply', X, check_input=False)
            for tree in self.estimators_)

        # One result per tree; transpose so that rows index the samples.
        return np.array(results).T

    def decision_path(self, X):
        """Return the decision path in the forest

        .. versionadded:: 0.18

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse csr array, shape = [n_samples, n_nodes]
            Return a node indicator matrix where non zero elements
            indicates that the samples goes through the nodes.

        n_nodes_ptr : array of size (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            gives the indicator value for the i-th estimator.

        """
        X = self._validate_X_predict(X)
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(parallel_helper)(tree, 'decision_path', X,
                                     check_input=False)
            for tree in self.estimators_)

        # Column offsets of each tree's block in the stacked indicator.
        n_nodes = [0]
        n_nodes.extend([i.shape[1] for i in indicators])
        n_nodes_ptr = np.array(n_nodes).cumsum()

        return sparse_hstack(indicators).tocsr(), n_nodes_ptr

    def fit(self, X, y, sample_weight=None):
        """Build a forest of trees from the training set (X, y).

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The training input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csc_matrix``.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``oob_score`` is requested without ``bootstrap``, or if, with
            ``warm_start``, ``n_estimators`` is smaller than the number of
            already fitted estimators.
        """
        # Validate or convert input data
        X = check_array(X, accept_sparse="csc", dtype=DTYPE)
        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Remap output
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        if y.ndim == 2 and y.shape[1] == 1:
            warn("A column-vector y was passed when a 1d array was"
                 " expected. Please change the shape of y to "
                 "(n_samples,), for example using ravel().",
                 DataConversionWarning, stacklevel=2)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        # Hook for subclasses: may re-encode y and derive per-sample weights
        # from ``class_weight``.
        y, expanded_class_weight = self._validate_y_class_weight(y)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Check parameters
        self._validate_estimator()

        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available"
                             " if bootstrap=True")

        random_state = check_random_state(self.random_state)

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'len(estimators_)=%d when warm_start==True'
                             % (self.n_estimators, len(self.estimators_)))

        elif n_more_estimators == 0:
            warn("Warm-start fitting without increasing n_estimators does not "
                 "fit new trees.")
        else:
            if self.warm_start and len(self.estimators_) > 0:
                # We draw from the random state to get the random state we
                # would have got if we hadn't used a warm_start.
                random_state.randint(MAX_INT, size=len(self.estimators_))

            trees = []
            for i in range(n_more_estimators):
                tree = self._make_estimator(append=False,
                                            random_state=random_state)
                trees.append(tree)

            # Parallel loop: we use the threading backend as the Cython code
            # for fitting the trees is internally releasing the Python GIL
            # making threading always more efficient than multiprocessing in
            # that case.
            trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                             backend="threading")(
                delayed(_parallel_build_trees)(
                    t, self, X, y, sample_weight, i, len(trees),
                    verbose=self.verbose, class_weight=self.class_weight)
                for i, t in enumerate(trees))

            # Collect newly grown trees
            self.estimators_.extend(trees)

        if self.oob_score:
            self._set_oob_score(X, y)

        # Decapsulate classes_ attributes
        if hasattr(self, "classes_") and self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self

    @abstractmethod
    def _set_oob_score(self, X, y):
        """Calculate out of bag predictions and score."""

    def _validate_y_class_weight(self, y):
        """Return ``y`` unchanged and no class-derived sample weights.

        Default implementation; subclasses may override it.
        """
        return y, None

    def _validate_X_predict(self, X):
        """Validate X whenever one tries to predict, apply, predict_proba

        Raises
        ------
        NotFittedError
            If the forest has no fitted estimators.
        """
        # ``estimators_`` only exists once ``fit`` has run. Look it up
        # defensively so that callers which do not check fitted-ness first
        # (``apply``, ``decision_path``) get a NotFittedError instead of a
        # bare AttributeError.
        estimators = getattr(self, "estimators_", None)
        if estimators is None or len(estimators) == 0:
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before exploiting the model.")

        # Validation is delegated to the first tree.
        return estimators[0]._validate_X_predict(X, check_input=True)

    @property
    def feature_importances_(self):
        """Return the feature importances (the higher, the more important the
           feature).

        Returns
        -------
        feature_importances_ : array, shape = [n_features]
        """
        check_is_fitted(self, 'estimators_')

        all_importances = Parallel(n_jobs=self.n_jobs,
                                   backend="threading")(
            delayed(getattr)(tree, 'feature_importances_')
            for tree in self.estimators_)

        # Plain average of the per-tree importances.
        return sum(all_importances) / len(self.estimators_)
|
|
|
|
|
|
# This is a utility function for joblib's Parallel. It can't go locally in
|
|
# ForestClassifier or ForestRegressor, because joblib complains that it cannot
|
|
# pickle it when placed there.
|
|
|
|
def accumulate_prediction(predict, X, out, lock):
    """Add one estimator's prediction for ``X`` into the buffers in ``out``.

    ``predict`` is called as ``predict(X, check_input=False)`` outside the
    lock; the in-place additions into ``out`` happen while ``lock`` is held.
    When ``out`` holds a single buffer the prediction is added to it as a
    whole, otherwise ``prediction[i]`` is added to ``out[i]``.
    """
    tree_output = predict(X, check_input=False)
    with lock:
        n_buffers = len(out)
        # With a single buffer the prediction is one array, not a sequence
        # with one array per buffer.
        per_buffer = [tree_output] if n_buffers == 1 else tree_output
        for k in range(n_buffers):
            out[k] += per_buffer[k]
|
|
|
|
|
|
class ForestClassifier(six.with_metaclass(ABCMeta, BaseForest,
                                          ClassifierMixin)):
    """Base class for forest of trees-based classifiers.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator,
                 n_estimators=10,
                 estimator_params=tuple(),
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):

        super(ForestClassifier, self).__init__(
            base_estimator,
            n_estimators=n_estimators,
            estimator_params=estimator_params,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)

    def _set_oob_score(self, X, y):
        """Compute out-of-bag score

        Sets ``oob_decision_function_`` (one array, or a list of arrays when
        there are several outputs) and ``oob_score_`` (accuracy of the
        out-of-bag predictions, averaged over the outputs).
        """
        X = check_array(X, dtype=DTYPE, accept_sparse='csr')

        # Indexed per output below, so ``n_classes_`` is expected to still be
        # a list here.
        n_classes_ = self.n_classes_
        n_samples = y.shape[0]

        oob_decision_function = []
        oob_score = 0.0
        predictions = [np.zeros((n_samples, n_classes_[k]))
                       for k in range(self.n_outputs_)]

        for estimator in self.estimators_:
            # Rebuild, from the tree's seed, the rows it did not see.
            unsampled_indices = _generate_unsampled_indices(
                estimator.random_state, n_samples)
            p_estimator = estimator.predict_proba(X[unsampled_indices, :],
                                                  check_input=False)

            if self.n_outputs_ == 1:
                p_estimator = [p_estimator]

            for k in range(self.n_outputs_):
                predictions[k][unsampled_indices, :] += p_estimator[k]

        for k in range(self.n_outputs_):
            if (predictions[k].sum(axis=1) == 0).any():
                warn("Some inputs do not have OOB scores. "
                     "This probably means too few trees were used "
                     "to compute any reliable oob estimates.")

            # Rows that were never out-of-bag sum to zero, so their
            # normalized entries are not finite.
            decision = (predictions[k] /
                        predictions[k].sum(axis=1)[:, np.newaxis])
            oob_decision_function.append(decision)
            oob_score += np.mean(y[:, k] ==
                                 np.argmax(predictions[k], axis=1), axis=0)

        if self.n_outputs_ == 1:
            self.oob_decision_function_ = oob_decision_function[0]
        else:
            self.oob_decision_function_ = oob_decision_function

        self.oob_score_ = oob_score / self.n_outputs_

    def _validate_y_class_weight(self, y):
        """Encode the class labels and expand ``class_weight``.

        Sets ``classes_`` and ``n_classes_`` as lists with one entry per
        output column.

        Parameters
        ----------
        y : array, shape = [n_samples, n_outputs]
            The class labels.

        Returns
        -------
        y : array of int, shape = [n_samples, n_outputs]
            For each column, the index of each label in the sorted unique
            labels of that column.

        expanded_class_weight : array or None
            Per-sample weights computed from ``class_weight``, or None when
            ``class_weight`` is None, or when it is "balanced_subsample" and
            ``bootstrap`` is enabled.

        Raises
        ------
        ValueError
            If ``class_weight`` is a string that is not a valid preset.
        """
        check_classification_targets(y)

        y = np.copy(y)
        expanded_class_weight = None

        if self.class_weight is not None:
            # Keep the raw labels: the weights are computed from them, not
            # from the integer-encoded targets.
            y_original = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        # Builtin ``int`` is what the ``np.int`` alias referred to; the alias
        # no longer exists in recent NumPy releases.
        y_store_unique_indices = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_store_unique_indices[:, k] = np.unique(
                y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_store_unique_indices

        if self.class_weight is not None:
            valid_presets = ('balanced', 'balanced_subsample')
            if isinstance(self.class_weight, six.string_types):
                if self.class_weight not in valid_presets:
                    raise ValueError('Valid presets for class_weight include '
                                     '"balanced" and "balanced_subsample". Given "%s".'
                                     % self.class_weight)
                if self.warm_start:
                    warn('class_weight presets "balanced" or "balanced_subsample" are '
                         'not recommended for warm_start if the fitted data '
                         'differs from the full dataset. In order to use '
                         '"balanced" weights, use compute_class_weight("balanced", '
                         'classes, y). In place of y you can use a large '
                         'enough sample of the full training set target to '
                         'properly estimate the class frequency '
                         'distributions. Pass the resulting weights as the '
                         'class_weight parameter.')

            # With bootstrap enabled, "balanced_subsample" is left
            # unexpanded here (the return value stays None).
            if (self.class_weight != 'balanced_subsample' or
                    not self.bootstrap):
                if self.class_weight == "balanced_subsample":
                    class_weight = "balanced"
                else:
                    class_weight = self.class_weight
                expanded_class_weight = compute_sample_weight(class_weight,
                                                              y_original)

        return y, expanded_class_weight

    def predict(self, X):
        """Predict class for X.

        The predicted class of an input sample is a vote by the trees in
        the forest, weighted by their probability estimates. That is,
        the predicted class is the one with highest mean probability
        estimate across the trees.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes.
        """
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)

        else:
            n_samples = proba[0].shape[0]
            # Allocate with the dtype of the class labels (taken from the
            # first output) so that non-numeric labels can be stored and
            # integer labels are not turned into floats.
            class_type = self.classes_[0].dtype
            predictions = np.empty((n_samples, self.n_outputs_),
                                   dtype=class_type)

            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                    axis=1),
                                                          axis=0)

            return predictions

    def predict_proba(self, X):
        """Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the trees in the forest. The
        class probability of a single tree is the fraction of samples of the same
        class in a leaf.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        check_is_fitted(self, 'estimators_')
        # Check data
        X = self._validate_X_predict(X)

        # Assign chunk of trees to jobs
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # avoid storing the output of every estimator by summing them here
        all_proba = [np.zeros((X.shape[0], j), dtype=np.float64)
                     for j in np.atleast_1d(self.n_classes_)]
        # The buffers are shared by all workers; the lock serializes the
        # in-place additions done in accumulate_prediction.
        lock = threading.Lock()
        Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")(
            delayed(accumulate_prediction)(e.predict_proba, X, all_proba, lock)
            for e in self.estimators_)

        for proba in all_proba:
            proba /= len(self.estimators_)

        if len(all_proba) == 1:
            return all_proba[0]
        else:
            return all_proba

    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

        The predicted class log-probabilities of an input sample is computed as
        the log of the mean predicted class probabilities of the trees in the
        forest.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1.
            The class log-probabilities of the input samples. The order of
            the classes corresponds to that in the attribute `classes_`.
        """
        proba = self.predict_proba(X)

        if self.n_outputs_ == 1:
            return np.log(proba)

        else:
            for k in range(self.n_outputs_):
                proba[k] = np.log(proba[k])

            return proba
|
|
|
|
|
|
class ForestRegressor(six.with_metaclass(ABCMeta, BaseForest, RegressorMixin)):
    """Base class for forest of trees-based regressors.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    @abstractmethod
    def __init__(self,
                 base_estimator,
                 n_estimators=10,
                 estimator_params=tuple(),
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        # No class_weight here: the base class default is used.
        super(ForestRegressor, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            estimator_params=estimator_params,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

    def predict(self, X):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the trees in the forest.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted values.
        """
        check_is_fitted(self, 'estimators_')
        X = self._validate_X_predict(X)

        # Number of jobs to use for the tree-level loop.
        n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

        # A single running sum is kept instead of one array per tree.
        n_rows = X.shape[0]
        if self.n_outputs_ > 1:
            out_shape = (n_rows, self.n_outputs_)
        else:
            out_shape = (n_rows,)
        y_hat = np.zeros(out_shape, dtype=np.float64)

        # The workers all add into ``y_hat`` and share one lock.
        lock = threading.Lock()
        tasks = (delayed(accumulate_prediction)(e.predict, X, [y_hat], lock)
                 for e in self.estimators_)
        Parallel(n_jobs=n_jobs, verbose=self.verbose,
                 backend="threading")(tasks)

        y_hat /= len(self.estimators_)
        return y_hat

    def _set_oob_score(self, X, y):
        """Compute out-of-bag scores

        Sets ``oob_prediction_`` (mean out-of-bag prediction per sample) and
        ``oob_score_`` (``r2_score`` of those predictions, averaged over the
        outputs).
        """
        X = check_array(X, dtype=DTYPE, accept_sparse='csr')

        n_samples = y.shape[0]
        buffer_shape = (n_samples, self.n_outputs_)
        predictions = np.zeros(buffer_shape)
        n_predictions = np.zeros(buffer_shape)

        for estimator in self.estimators_:
            # Rows this tree did not draw, rebuilt from its seed.
            oob_rows = _generate_unsampled_indices(
                estimator.random_state, n_samples)
            tree_pred = estimator.predict(X[oob_rows, :], check_input=False)

            if self.n_outputs_ == 1:
                tree_pred = tree_pred[:, np.newaxis]

            predictions[oob_rows, :] += tree_pred
            n_predictions[oob_rows, :] += 1

        never_oob = n_predictions == 0
        if never_oob.any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")
            # Avoid dividing by zero; those rows keep a prediction of 0.
            n_predictions[never_oob] = 1

        predictions /= n_predictions

        if self.n_outputs_ == 1:
            self.oob_prediction_ = predictions.reshape((n_samples, ))
        else:
            self.oob_prediction_ = predictions

        score_sum = 0.0
        for k in range(self.n_outputs_):
            score_sum += r2_score(y[:, k], predictions[:, k])
        self.oob_score_ = score_sum / self.n_outputs_
|
|
|
|
|
|
class RandomForestClassifier(ForestClassifier):
|
|
"""A random forest classifier.
|
|
|
|
A random forest is a meta estimator that fits a number of decision tree
|
|
classifiers on various sub-samples of the dataset and use averaging to
|
|
improve the predictive accuracy and control over-fitting.
|
|
The sub-sample size is always the same as the original
|
|
input sample size but the samples are drawn with replacement if
|
|
`bootstrap=True` (default).
|
|
|
|
Read more in the :ref:`User Guide <forest>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_estimators : integer, optional (default=10)
|
|
The number of trees in the forest.
|
|
|
|
criterion : string, optional (default="gini")
|
|
The function to measure the quality of a split. Supported criteria are
|
|
"gini" for the Gini impurity and "entropy" for the information gain.
|
|
Note: this parameter is tree-specific.
|
|
|
|
max_features : int, float, string or None, optional (default="auto")
|
|
The number of features to consider when looking for the best split:
|
|
|
|
- If int, then consider `max_features` features at each split.
|
|
- If float, then `max_features` is a percentage and
|
|
`int(max_features * n_features)` features are considered at each
|
|
split.
|
|
- If "auto", then `max_features=sqrt(n_features)`.
|
|
- If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
|
|
- If "log2", then `max_features=log2(n_features)`.
|
|
- If None, then `max_features=n_features`.
|
|
|
|
Note: the search for a split does not stop until at least one
|
|
valid partition of the node samples is found, even if it requires to
|
|
effectively inspect more than ``max_features`` features.
|
|
|
|
max_depth : integer or None, optional (default=None)
|
|
The maximum depth of the tree. If None, then nodes are expanded until
|
|
all leaves are pure or until all leaves contain less than
|
|
min_samples_split samples.
|
|
|
|
min_samples_split : int, float, optional (default=2)
|
|
The minimum number of samples required to split an internal node:
|
|
|
|
- If int, then consider `min_samples_split` as the minimum number.
|
|
- If float, then `min_samples_split` is a percentage and
|
|
`ceil(min_samples_split * n_samples)` are the minimum
|
|
number of samples for each split.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for percentages.
|
|
|
|
min_samples_leaf : int, float, optional (default=1)
|
|
The minimum number of samples required to be at a leaf node:
|
|
|
|
- If int, then consider `min_samples_leaf` as the minimum number.
|
|
- If float, then `min_samples_leaf` is a percentage and
|
|
`ceil(min_samples_leaf * n_samples)` are the minimum
|
|
number of samples for each node.
|
|
|
|
.. versionchanged:: 0.18
|
|
Added float values for percentages.
|
|
|
|
min_weight_fraction_leaf : float, optional (default=0.)
|
|
The minimum weighted fraction of the sum total of weights (of all
|
|
the input samples) required to be at a leaf node. Samples have
|
|
equal weight when sample_weight is not provided.
|
|
|
|
max_leaf_nodes : int or None, optional (default=None)
|
|
Grow trees with ``max_leaf_nodes`` in best-first fashion.
|
|
Best nodes are defined as relative reduction in impurity.
|
|
If None then unlimited number of leaf nodes.
|
|
|
|
min_impurity_split : float,
|
|
Threshold for early stopping in tree growth. A node will split
|
|
if its impurity is above the threshold, otherwise it is a leaf.
|
|
|
|
.. deprecated:: 0.19
|
|
``min_impurity_split`` has been deprecated in favor of
|
|
``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
|
|
Use ``min_impurity_decrease`` instead.
|
|
|
|
min_impurity_decrease : float, optional (default=0.)
|
|
A node will be split if this split induces a decrease of the impurity
|
|
greater than or equal to this value.
|
|
|
|
The weighted impurity decrease equation is the following::
|
|
|
|
N_t / N * (impurity - N_t_R / N_t * right_impurity
|
|
- N_t_L / N_t * left_impurity)
|
|
|
|
where ``N`` is the total number of samples, ``N_t`` is the number of
|
|
samples at the current node, ``N_t_L`` is the number of samples in the
|
|
left child, and ``N_t_R`` is the number of samples in the right child.
|
|
|
|
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
|
|
if ``sample_weight`` is passed.
|
|
|
|
.. versionadded:: 0.19
|
|
|
|
bootstrap : boolean, optional (default=True)
|
|
Whether bootstrap samples are used when building trees.
|
|
|
|
oob_score : bool (default=False)
|
|
Whether to use out-of-bag samples to estimate
|
|
the generalization accuracy.
|
|
|
|
n_jobs : integer, optional (default=1)
|
|
The number of jobs to run in parallel for both `fit` and `predict`.
|
|
If -1, then the number of jobs is set to the number of cores.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
verbose : int, optional (default=0)
|
|
Controls the verbosity of the tree building process.
|
|
|
|
warm_start : bool, optional (default=False)
|
|
When set to ``True``, reuse the solution of the previous call to fit
|
|
and add more estimators to the ensemble, otherwise, just fit a whole
|
|
new forest.
|
|
|
|
class_weight : dict, list of dicts, "balanced",
|
|
"balanced_subsample" or None, optional (default=None)
|
|
Weights associated with classes in the form ``{class_label: weight}``.
|
|
If not given, all classes are supposed to have weight one. For
|
|
multi-output problems, a list of dicts can be provided in the same
|
|
order as the columns of y.
|
|
|
|
Note that for multioutput (including multilabel) weights should be
|
|
defined for each class of every column in its own dict. For example,
|
|
for four-class multilabel classification weights should be
|
|
[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
|
|
[{1:1}, {2:5}, {3:1}, {4:1}].
|
|
|
|
The "balanced" mode uses the values of y to automatically adjust
|
|
weights inversely proportional to class frequencies in the input data
|
|
as ``n_samples / (n_classes * np.bincount(y))``
|
|
|
|
The "balanced_subsample" mode is the same as "balanced" except that
|
|
weights are computed based on the bootstrap sample for every tree
|
|
grown.
|
|
|
|
For multi-output, the weights of each column of y will be multiplied.
|
|
|
|
Note that these weights will be multiplied with sample_weight (passed
|
|
through the fit method) if sample_weight is specified.
|
|
|
|
Attributes
|
|
----------
|
|
estimators_ : list of DecisionTreeClassifier
|
|
The collection of fitted sub-estimators.
|
|
|
|
classes_ : array of shape = [n_classes] or a list of such arrays
|
|
The classes labels (single output problem), or a list of arrays of
|
|
class labels (multi-output problem).
|
|
|
|
n_classes_ : int or list
|
|
The number of classes (single output problem), or a list containing the
|
|
number of classes for each output (multi-output problem).
|
|
|
|
n_features_ : int
|
|
The number of features when ``fit`` is performed.
|
|
|
|
n_outputs_ : int
|
|
The number of outputs when ``fit`` is performed.
|
|
|
|
feature_importances_ : array of shape = [n_features]
|
|
The feature importances (the higher, the more important the feature).
|
|
|
|
oob_score_ : float
|
|
Score of the training dataset obtained using an out-of-bag estimate.
|
|
|
|
oob_decision_function_ : array of shape = [n_samples, n_classes]
|
|
Decision function computed with out-of-bag estimate on the training
|
|
set. If n_estimators is small it might be possible that a data point
|
|
was never left out during the bootstrap. In this case,
|
|
`oob_decision_function_` might contain NaN.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.ensemble import RandomForestClassifier
|
|
>>> from sklearn.datasets import make_classification
|
|
>>>
|
|
>>> X, y = make_classification(n_samples=1000, n_features=4,
|
|
... n_informative=2, n_redundant=0,
|
|
... random_state=0, shuffle=False)
|
|
>>> clf = RandomForestClassifier(max_depth=2, random_state=0)
|
|
>>> clf.fit(X, y)
|
|
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
|
|
max_depth=2, max_features='auto', max_leaf_nodes=None,
|
|
min_impurity_decrease=0.0, min_impurity_split=None,
|
|
min_samples_leaf=1, min_samples_split=2,
|
|
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
|
|
oob_score=False, random_state=0, verbose=0, warm_start=False)
|
|
>>> print(clf.feature_importances_)
|
|
[ 0.17287856 0.80608704 0.01884792 0.00218648]
|
|
>>> print(clf.predict([[0, 0, 0, 0]]))
|
|
[1]
|
|
|
|
Notes
|
|
-----
|
|
The default values for the parameters controlling the size of the trees
|
|
(e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
|
|
unpruned trees which can potentially be very large on some data sets. To
|
|
reduce memory consumption, the complexity and size of the trees should be
|
|
controlled by setting those parameter values.
|
|
|
|
The features are always randomly permuted at each split. Therefore,
|
|
the best found split may vary, even with the same training data,
|
|
``max_features=n_features`` and ``bootstrap=False``, if the improvement
|
|
of the criterion is identical for several splits enumerated during the
|
|
search of the best split. To obtain a deterministic behaviour during
|
|
fitting, ``random_state`` has to be fixed.
|
|
|
|
References
|
|
----------
|
|
|
|
.. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
|
|
|
|
See also
|
|
--------
|
|
DecisionTreeClassifier, ExtraTreesClassifier
|
|
"""
|
|
def __init__(self,
             n_estimators=10,
             criterion="gini",
             max_depth=None,
             min_samples_split=2,
             min_samples_leaf=1,
             min_weight_fraction_leaf=0.,
             max_features="auto",
             max_leaf_nodes=None,
             min_impurity_decrease=0.,
             min_impurity_split=None,
             bootstrap=True,
             oob_score=False,
             n_jobs=1,
             random_state=None,
             verbose=0,
             warm_start=False,
             class_weight=None):
    # Names handed to the parent constructor as ``estimator_params``.
    # Every entry except ``random_state`` is stored on ``self`` at the end
    # of this method; ``random_state`` is forwarded to the parent below.
    # NOTE(review): presumably the base ensemble copies these attributes
    # onto each sub-estimator -- confirm against BaseEnsemble.
    tree_param_names = (
        "criterion", "max_depth", "min_samples_split",
        "min_samples_leaf", "min_weight_fraction_leaf",
        "max_features", "max_leaf_nodes",
        "min_impurity_decrease", "min_impurity_split",
        "random_state",
    )

    # Ensemble-level options are handled entirely by the parent class.
    super(RandomForestClassifier, self).__init__(
        base_estimator=DecisionTreeClassifier(),
        n_estimators=n_estimators,
        estimator_params=tree_param_names,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
        class_weight=class_weight)

    # Per-tree hyper-parameters are recorded verbatim (no validation here).
    self.criterion = criterion
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.min_weight_fraction_leaf = min_weight_fraction_leaf
    self.max_features = max_features
    self.max_leaf_nodes = max_leaf_nodes
    self.min_impurity_decrease = min_impurity_decrease
    self.min_impurity_split = min_impurity_split
|
|
|
|
|
|
class RandomForestRegressor(ForestRegressor):
    """A random forest regressor.

    A random forest is a meta estimator that fits a number of decision tree
    regressors on various sub-samples of the dataset and uses averaging to
    improve the predictive accuracy and control over-fitting.
    The sub-sample size is always the same as the original input sample
    size, but the samples are drawn with replacement if `bootstrap=True`
    (default).

    Read more in the :ref:`User Guide <forest>`.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.

    criterion : string, optional (default="mse")
        The function to measure the quality of a split. Supported criteria
        are "mse" for the mean squared error, which is equal to variance
        reduction as feature selection criterion, and "mae" for the mean
        absolute error.

        .. versionadded:: 0.18
           Mean Absolute Error (MAE) criterion.

    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain fewer than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` is the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` is the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    min_impurity_split : float or None, optional (default=None)
        Threshold for early stopping in tree growth. A node will split
        if its impurity is above the threshold, otherwise it is a leaf.

        .. deprecated:: 0.19
           ``min_impurity_split`` has been deprecated in favor of
           ``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
           Use ``min_impurity_decrease`` instead.

    bootstrap : boolean, optional (default=True)
        Whether bootstrap samples are used when building trees.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate
        the R^2 on unseen data.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    Attributes
    ----------
    estimators_ : list of DecisionTreeRegressor
        The collection of fitted sub-estimators.

    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    n_features_ : int
        The number of features when ``fit`` is performed.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_prediction_ : array of shape = [n_samples]
        Prediction computed with out-of-bag estimate on the training set.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.datasets import make_regression
    >>>
    >>> X, y = make_regression(n_features=4, n_informative=2,
    ...                        random_state=0, shuffle=False)
    >>> regr = RandomForestRegressor(max_depth=2, random_state=0)
    >>> regr.fit(X, y)
    RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
               max_features='auto', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
               oob_score=False, random_state=0, verbose=0, warm_start=False)
    >>> print(regr.feature_importances_)
    [ 0.17339552  0.81594114  0.          0.01066333]
    >>> print(regr.predict([[0, 0, 0, 0]]))
    [-2.50699856]

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    The features are always randomly permuted at each split. Therefore,
    the best found split may vary, even with the same training data,
    ``max_features=n_features`` and ``bootstrap=False``, if the improvement
    of the criterion is identical for several splits enumerated during the
    search of the best split. To obtain a deterministic behaviour during
    fitting, ``random_state`` has to be fixed.

    References
    ----------

    .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    See also
    --------
    DecisionTreeRegressor, ExtraTreesRegressor
    """
    def __init__(self,
                 n_estimators=10,
                 criterion="mse",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        # Names handed to the parent constructor as ``estimator_params``.
        # Every entry except ``random_state`` is stored on ``self`` at the
        # end of this method; ``random_state`` is forwarded to the parent.
        # NOTE(review): presumably the base ensemble copies these attributes
        # onto each sub-estimator -- confirm against BaseEnsemble.
        tree_param_names = (
            "criterion", "max_depth", "min_samples_split",
            "min_samples_leaf", "min_weight_fraction_leaf",
            "max_features", "max_leaf_nodes",
            "min_impurity_decrease", "min_impurity_split",
            "random_state",
        )

        # Ensemble-level options are handled entirely by the parent class.
        super(RandomForestRegressor, self).__init__(
            base_estimator=DecisionTreeRegressor(),
            n_estimators=n_estimators,
            estimator_params=tree_param_names,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        # Per-tree hyper-parameters are recorded verbatim (no validation).
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
|
|
|
|
|
|
class ExtraTreesClassifier(ForestClassifier):
    """An extra-trees classifier.

    This class implements a meta estimator that fits a number of
    randomized decision trees (a.k.a. extra-trees) on various sub-samples
    of the dataset and uses averaging to improve the predictive accuracy
    and control over-fitting.

    Read more in the :ref:`User Guide <forest>`.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.

    criterion : string, optional (default="gini")
        The function to measure the quality of a split. Supported criteria are
        "gini" for the Gini impurity and "entropy" for the information gain.

    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain fewer than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` is the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` is the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=sqrt(n_features)`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    min_impurity_split : float or None, optional (default=None)
        Threshold for early stopping in tree growth. A node will split
        if its impurity is above the threshold, otherwise it is a leaf.

        .. deprecated:: 0.19
           ``min_impurity_split`` has been deprecated in favor of
           ``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
           Use ``min_impurity_decrease`` instead.

    bootstrap : boolean, optional (default=False)
        Whether bootstrap samples are used when building trees.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate
        the generalization accuracy.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    class_weight : dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        Note that for multioutput (including multilabel) weights should be
        defined for each class of every column in its own dict. For example,
        for four-class multilabel classification weights should be
        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
        [{1:1}, {2:5}, {3:1}, {4:1}].

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

        The "balanced_subsample" mode is the same as "balanced" except that
        weights are computed based on the bootstrap sample for every tree
        grown.

        For multi-output, the weights of each column of y will be multiplied.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

    Attributes
    ----------
    estimators_ : list of ExtraTreeClassifier
        The collection of fitted sub-estimators.

    classes_ : array of shape = [n_classes] or a list of such arrays
        The class labels (single output problem), or a list of arrays of
        class labels (multi-output problem).

    n_classes_ : int or list
        The number of classes (single output problem), or a list containing the
        number of classes for each output (multi-output problem).

    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    n_features_ : int
        The number of features when ``fit`` is performed.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_decision_function_ : array of shape = [n_samples, n_classes]
        Decision function computed with out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        `oob_decision_function_` might contain NaN.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.

    See also
    --------
    sklearn.tree.ExtraTreeClassifier : Base classifier for this ensemble.
    RandomForestClassifier : Ensemble Classifier based on trees with optimal
        splits.
    """
    def __init__(self,
                 n_estimators=10,
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):
        # Names handed to the parent constructor as ``estimator_params``.
        # Every entry except ``random_state`` is stored on ``self`` at the
        # end of this method; ``random_state`` is forwarded to the parent.
        # NOTE(review): presumably the base ensemble copies these attributes
        # onto each sub-estimator -- confirm against BaseEnsemble.
        tree_param_names = (
            "criterion", "max_depth", "min_samples_split",
            "min_samples_leaf", "min_weight_fraction_leaf",
            "max_features", "max_leaf_nodes",
            "min_impurity_decrease", "min_impurity_split",
            "random_state",
        )

        # Ensemble-level options are handled entirely by the parent class.
        super(ExtraTreesClassifier, self).__init__(
            base_estimator=ExtraTreeClassifier(),
            n_estimators=n_estimators,
            estimator_params=tree_param_names,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)

        # Per-tree hyper-parameters are recorded verbatim (no validation).
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
|
|
|
|
|
|
class ExtraTreesRegressor(ForestRegressor):
    """An extra-trees regressor.

    This class implements a meta estimator that fits a number of
    randomized decision trees (a.k.a. extra-trees) on various sub-samples
    of the dataset and uses averaging to improve the predictive accuracy
    and control over-fitting.

    Read more in the :ref:`User Guide <forest>`.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        The number of trees in the forest.

    criterion : string, optional (default="mse")
        The function to measure the quality of a split. Supported criteria
        are "mse" for the mean squared error, which is equal to variance
        reduction as feature selection criterion, and "mae" for the mean
        absolute error.

        .. versionadded:: 0.18
           Mean Absolute Error (MAE) criterion.

    max_depth : integer or None, optional (default=None)
        The maximum depth of the tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain fewer than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` is the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` is the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_features : int, float, string or None, optional (default="auto")
        The number of features to consider when looking for the best split:

        - If int, then consider `max_features` features at each split.
        - If float, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at each
          split.
        - If "auto", then `max_features=n_features`.
        - If "sqrt", then `max_features=sqrt(n_features)`.
        - If "log2", then `max_features=log2(n_features)`.
        - If None, then `max_features=n_features`.

        Note: the search for a split does not stop until at least one
        valid partition of the node samples is found, even if it requires to
        effectively inspect more than ``max_features`` features.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    min_impurity_split : float or None, optional (default=None)
        Threshold for early stopping in tree growth. A node will split
        if its impurity is above the threshold, otherwise it is a leaf.

        .. deprecated:: 0.19
           ``min_impurity_split`` has been deprecated in favor of
           ``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
           Use ``min_impurity_decrease`` instead.

    bootstrap : boolean, optional (default=False)
        Whether bootstrap samples are used when building trees.

    oob_score : bool, optional (default=False)
        Whether to use out-of-bag samples to estimate the R^2 on unseen data.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    Attributes
    ----------
    estimators_ : list of ExtraTreeRegressor
        The collection of fitted sub-estimators.

    feature_importances_ : array of shape = [n_features]
        The feature importances (the higher, the more important the feature).

    n_features_ : int
        The number of features when ``fit`` is performed.

    n_outputs_ : int
        The number of outputs when ``fit`` is performed.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.

    oob_prediction_ : array of shape = [n_samples]
        Prediction computed with out-of-bag estimate on the training set.

    Notes
    -----
    The default values for the parameters controlling the size of the trees
    (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
    unpruned trees which can potentially be very large on some data sets. To
    reduce memory consumption, the complexity and size of the trees should be
    controlled by setting those parameter values.

    References
    ----------

    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.

    See also
    --------
    sklearn.tree.ExtraTreeRegressor : Base estimator for this ensemble.
    RandomForestRegressor : Ensemble regressor using trees with optimal
        splits.
    """
    def __init__(self,
                 n_estimators=10,
                 criterion="mse",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 bootstrap=False,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        # Names handed to the parent constructor as ``estimator_params``.
        # Every entry except ``random_state`` is stored on ``self`` at the
        # end of this method; ``random_state`` is forwarded to the parent.
        # NOTE(review): presumably the base ensemble copies these attributes
        # onto each sub-estimator -- confirm against BaseEnsemble.
        tree_param_names = (
            "criterion", "max_depth", "min_samples_split",
            "min_samples_leaf", "min_weight_fraction_leaf",
            "max_features", "max_leaf_nodes",
            "min_impurity_decrease", "min_impurity_split",
            "random_state",
        )

        # Ensemble-level options are handled entirely by the parent class.
        super(ExtraTreesRegressor, self).__init__(
            base_estimator=ExtraTreeRegressor(),
            n_estimators=n_estimators,
            estimator_params=tree_param_names,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        # Per-tree hyper-parameters are recorded verbatim (no validation).
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
|
|
|
|
|
|
class RandomTreesEmbedding(BaseForest):
    """An ensemble of totally random trees.

    An unsupervised transformation of a dataset to a high-dimensional
    sparse representation. A datapoint is coded according to which leaf of
    each tree it is sorted into. Using a one-hot encoding of the leaves,
    this leads to a binary coding with as many ones as there are trees in
    the forest.

    The dimensionality of the resulting representation is
    ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,
    the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.

    Read more in the :ref:`User Guide <random_trees_embedding>`.

    Parameters
    ----------
    n_estimators : integer, optional (default=10)
        Number of trees in the forest.

    max_depth : integer, optional (default=5)
        The maximum depth of each tree. If None, then nodes are expanded until
        all leaves are pure or until all leaves contain less than
        min_samples_split samples.

    min_samples_split : int, float, optional (default=2)
        The minimum number of samples required to split an internal node:

        - If int, then consider `min_samples_split` as the minimum number.
        - If float, then `min_samples_split` is a fraction and
          `ceil(min_samples_split * n_samples)` is the minimum
          number of samples for each split.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_samples_leaf : int, float, optional (default=1)
        The minimum number of samples required to be at a leaf node:

        - If int, then consider `min_samples_leaf` as the minimum number.
        - If float, then `min_samples_leaf` is a fraction and
          `ceil(min_samples_leaf * n_samples)` is the minimum
          number of samples for each node.

        .. versionchanged:: 0.18
           Added float values for fractions.

    min_weight_fraction_leaf : float, optional (default=0.)
        The minimum weighted fraction of the sum total of weights (of all
        the input samples) required to be at a leaf node. Samples have
        equal weight when sample_weight is not provided.

    max_leaf_nodes : int or None, optional (default=None)
        Grow trees with ``max_leaf_nodes`` in best-first fashion.
        Best nodes are defined as relative reduction in impurity.
        If None then unlimited number of leaf nodes.

    min_impurity_decrease : float, optional (default=0.)
        A node will be split if this split induces a decrease of the impurity
        greater than or equal to this value.

        The weighted impurity decrease equation is the following::

            N_t / N * (impurity - N_t_R / N_t * right_impurity
                                - N_t_L / N_t * left_impurity)

        where ``N`` is the total number of samples, ``N_t`` is the number of
        samples at the current node, ``N_t_L`` is the number of samples in the
        left child, and ``N_t_R`` is the number of samples in the right child.

        ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
        if ``sample_weight`` is passed.

        .. versionadded:: 0.19

    min_impurity_split : float or None, optional (default=None)
        Threshold for early stopping in tree growth. A node will split
        if its impurity is above the threshold, otherwise it is a leaf.

        .. deprecated:: 0.19
           ``min_impurity_split`` has been deprecated in favor of
           ``min_impurity_decrease`` in 0.19 and will be removed in 0.21.
           Use ``min_impurity_decrease`` instead.

    sparse_output : bool, optional (default=True)
        Whether or not to return a sparse CSR matrix, as default behavior,
        or to return a dense array compatible with dense pipeline operators.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    warm_start : bool, optional (default=False)
        When set to ``True``, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit a whole
        new forest.

    Attributes
    ----------
    estimators_ : list of ExtraTreeRegressor
        The collection of fitted sub-estimators.

    one_hot_encoder_ : OneHotEncoder
        Encoder fitted in ``fit_transform`` on the leaf indices returned by
        ``apply``; ``transform`` reuses it.

    Notes
    -----
    Some settings are fixed by the constructor and are not exposed as
    parameters: the base class is always initialised with
    ``bootstrap=False`` and ``oob_score=False``, and the attributes
    ``criterion`` and ``max_features`` are always set to ``'mse'`` and ``1``
    respectively.

    References
    ----------
    .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
           Machine Learning, 63(1), 3-42, 2006.
    .. [2] Moosmann, F. and Triggs, B. and Jurie, F.  "Fast discriminative
           visual codebooks using randomized clustering forests"
           NIPS 2007

    """

    def __init__(self,
                 n_estimators=10,
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 sparse_output=True,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        super(RandomTreesEmbedding, self).__init__(
            base_estimator=ExtraTreeRegressor(),
            n_estimators=n_estimators,
            estimator_params=("criterion", "max_depth", "min_samples_split",
                              "min_samples_leaf", "min_weight_fraction_leaf",
                              "max_features", "max_leaf_nodes",
                              "min_impurity_decrease", "min_impurity_split",
                              "random_state"),
            # Bootstrap sampling and OOB scoring are deliberately not
            # configurable for the embedding.
            bootstrap=False,
            oob_score=False,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        # ``criterion`` and ``max_features`` are fixed rather than taken
        # from the caller; the remaining values are stored verbatim.
        self.criterion = 'mse'
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = 1
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.sparse_output = sparse_output

    def _set_oob_score(self, X, y):
        """Always raise: out-of-bag scoring is not available here.

        Raises
        ------
        NotImplementedError
            Unconditionally.
        """
        raise NotImplementedError("OOB score not supported by tree embedding")

    def fit(self, X, y=None, sample_weight=None):
        """Fit estimator.

        Delegates to ``fit_transform`` and discards the transformed data.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            The input samples. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported, use sparse
            ``csc_matrix`` for maximum efficiency.

        y : ignored
            Accepted for API compatibility only; ``fit_transform`` replaces
            it with randomly drawn targets.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node.

        Returns
        -------
        self : object
            Returns self.

        """
        self.fit_transform(X, y, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y=None, sample_weight=None):
        """Fit estimator and transform dataset.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Input data used to build forests. Use ``dtype=np.float32`` for
            maximum efficiency.

        y : ignored
            Accepted for API compatibility only. The value passed in is
            overwritten by one uniformly distributed random target per
            sample, drawn from the generator derived from ``random_state``.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node.

        Returns
        -------
        X_transformed : sparse matrix or array, shape=(n_samples, n_out)
            Transformed dataset: the one-hot encoding of the leaf indices
            returned by ``apply``. Sparse when ``sparse_output=True``
            (default), dense otherwise.
        """
        X = check_array(X, accept_sparse=['csc'])
        if issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # The embedding is unsupervised: the trees are grown on random
        # targets, so any ``y`` supplied by the caller is discarded here.
        rnd = check_random_state(self.random_state)
        y = rnd.uniform(size=X.shape[0])
        super(RandomTreesEmbedding, self).fit(X, y,
                                              sample_weight=sample_weight)

        # Fit the encoder on the training leaves so that ``transform`` can
        # reuse the same leaf-to-column mapping.
        self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
        return self.one_hot_encoder_.fit_transform(self.apply(X))

    def transform(self, X):
        """Transform dataset.

        Requires a previous call to ``fit`` or ``fit_transform``, which
        create the ``one_hot_encoder_`` attribute used here.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency. Sparse matrices are also supported, use sparse
            ``csr_matrix`` for maximum efficiency.

        Returns
        -------
        X_transformed : sparse matrix or array, shape=(n_samples, n_out)
            Transformed dataset. Sparse when ``sparse_output=True``
            (default), dense otherwise.
        """
        return self.one_hot_encoder_.transform(self.apply(X))
|