  1. """
  2. The :mod:`sklearn.pipeline` module implements utilities to build a composite
  3. estimator, as a chain of transforms and estimators.
  4. """
  5. # Author: Edouard Duchesnay
  6. # Gael Varoquaux
  7. # Virgile Fritsch
  8. # Alexandre Gramfort
  9. # Lars Buitinck
  10. # License: BSD
  11. from collections import defaultdict
  12. import numpy as np
  13. from scipy import sparse
  14. from .base import clone, TransformerMixin
  15. from .utils import Parallel, delayed
  16. from .externals import six
  17. from .utils.metaestimators import if_delegate_has_method
  18. from .utils import Bunch
  19. from .utils.validation import check_memory
  20. from .utils.metaestimators import _BaseComposition
  21. __all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union']
  22. class Pipeline(_BaseComposition):
  23. """Pipeline of transforms with a final estimator.
  24. Sequentially apply a list of transforms and a final estimator.
  25. Intermediate steps of the pipeline must be 'transforms', that is, they
  26. must implement fit and transform methods.
  27. The final estimator only needs to implement fit.
  28. The transformers in the pipeline can be cached using ``memory`` argument.
  29. The purpose of the pipeline is to assemble several steps that can be
  30. cross-validated together while setting different parameters.
  31. For this, it enables setting parameters of the various steps using their
  32. names and the parameter name separated by a '__', as in the example below.
  33. A step's estimator may be replaced entirely by setting the parameter
  34. with its name to another estimator, or a transformer removed by setting
  35. to None.
  36. Read more in the :ref:`User Guide <pipeline>`.
  37. Parameters
  38. ----------
  39. steps : list
  40. List of (name, transform) tuples (implementing fit/transform) that are
  41. chained, in the order in which they are chained, with the last object
  42. an estimator.
  43. memory : None, str or object with the joblib.Memory interface, optional
  44. Used to cache the fitted transformers of the pipeline. By default,
  45. no caching is performed. If a string is given, it is the path to
  46. the caching directory. Enabling caching triggers a clone of
  47. the transformers before fitting. Therefore, the transformer
  48. instance given to the pipeline cannot be inspected
  49. directly. Use the attribute ``named_steps`` or ``steps`` to
  50. inspect estimators within the pipeline. Caching the
  51. transformers is advantageous when fitting is time consuming.
  52. Attributes
  53. ----------
  54. named_steps : bunch object, a dictionary with attribute access
  55. Read-only attribute to access any step parameter by user given name.
  56. Keys are step names and values are steps parameters.
  57. See also
  58. --------
  59. sklearn.pipeline.make_pipeline : convenience function for simplified
  60. pipeline construction.
  61. Examples
  62. --------
  63. >>> from sklearn import svm
  64. >>> from sklearn.datasets import samples_generator
  65. >>> from sklearn.feature_selection import SelectKBest
  66. >>> from sklearn.feature_selection import f_regression
  67. >>> from sklearn.pipeline import Pipeline
  68. >>> # generate some data to play with
  69. >>> X, y = samples_generator.make_classification(
  70. ... n_informative=5, n_redundant=0, random_state=42)
  71. >>> # ANOVA SVM-C
  72. >>> anova_filter = SelectKBest(f_regression, k=5)
  73. >>> clf = svm.SVC(kernel='linear')
  74. >>> anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
  75. >>> # You can set the parameters using the names issued
  76. >>> # For instance, fit using a k of 10 in the SelectKBest
  77. >>> # and a parameter 'C' of the svm
  78. >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)
  79. ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
  80. Pipeline(memory=None,
  81. steps=[('anova', SelectKBest(...)),
  82. ('svc', SVC(...))])
  83. >>> prediction = anova_svm.predict(X)
  84. >>> anova_svm.score(X, y) # doctest: +ELLIPSIS
  85. 0.83
  86. >>> # getting the selected features chosen by anova_filter
  87. >>> anova_svm.named_steps['anova'].get_support()
  88. ... # doctest: +NORMALIZE_WHITESPACE
  89. array([False, False, True, True, False, False, True, True, False,
  90. True, False, True, True, False, True, False, True, True,
  91. False, False])
  92. >>> # Another way to get selected features chosen by anova_filter
  93. >>> anova_svm.named_steps.anova.get_support()
  94. ... # doctest: +NORMALIZE_WHITESPACE
  95. array([False, False, True, True, False, False, True, True, False,
  96. True, False, True, True, False, True, False, True, True,
  97. False, False])
  98. """

    # BaseEstimator interface

    def __init__(self, steps, memory=None):
        self.steps = steps
        self._validate_steps()
        self.memory = memory

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params('steps', deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``.

        Returns
        -------
        self
        """
        self._set_params('steps', **kwargs)
        return self
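
    # A minimal usage sketch, assuming the anova/SVC pipeline from the class
    # docstring above: a step's parameter is addressed as
    # ``<step name>__<parameter name>``, and a whole step can be replaced
    # (or removed with ``None``) by passing its name.
    #
    #     >>> anova_svm.set_params(svc__C=10)   # tune a nested parameter
    #     >>> anova_svm.set_params(anova=None)  # drop the transformer step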

    def _validate_steps(self):
        names, estimators = zip(*self.steps)

        # validate names
        self._validate_names(names)

        # validate estimators
        transformers = estimators[:-1]
        estimator = estimators[-1]

        for t in transformers:
            if t is None:
                continue
            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                    hasattr(t, "transform")):
                raise TypeError("All intermediate steps should be "
                                "transformers and implement fit and transform."
                                " '%s' (type %s) doesn't" % (t, type(t)))

        # We allow last estimator to be None as an identity transformation
        if estimator is not None and not hasattr(estimator, "fit"):
            raise TypeError("Last step of Pipeline should implement fit. "
                            "'%s' (type %s) doesn't"
                            % (estimator, type(estimator)))

    @property
    def _estimator_type(self):
        return self.steps[-1][1]._estimator_type

    @property
    def named_steps(self):
        # Use Bunch object to improve autocomplete
        return Bunch(**dict(self.steps))

    @property
    def _final_estimator(self):
        return self.steps[-1][1]

    # Estimator interface

    def _fit(self, X, y=None, **fit_params):
        # shallow copy of steps - this should really be steps_
        self.steps = list(self.steps)
        self._validate_steps()
        # Setup the memory
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(_fit_transform_one)

        fit_params_steps = dict((name, {}) for name, step in self.steps
                                if step is not None)
        for pname, pval in six.iteritems(fit_params):
            step, param = pname.split('__', 1)
            fit_params_steps[step][param] = pval
        Xt = X
        for step_idx, (name, transformer) in enumerate(self.steps[:-1]):
            if transformer is None:
                pass
            else:
                if hasattr(memory, 'location'):
                    # joblib >= 0.12
                    if memory.location is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                    else:
                        cloned_transformer = clone(transformer)
                elif hasattr(memory, 'cachedir'):
                    # joblib < 0.11
                    if memory.cachedir is None:
                        # we do not clone when caching is disabled to
                        # preserve backward compatibility
                        cloned_transformer = transformer
                    else:
                        cloned_transformer = clone(transformer)
                else:
                    cloned_transformer = clone(transformer)
                # Fit or load from cache the current transformer
                Xt, fitted_transformer = fit_transform_one_cached(
                    cloned_transformer, Xt, y, None,
                    **fit_params_steps[name])
                # Replace the transformer of the step with the fitted
                # transformer. This is necessary when loading the transformer
                # from the cache.
                self.steps[step_idx] = (name, fitted_transformer)
        if self._final_estimator is None:
            return Xt, {}
        return Xt, fit_params_steps[self.steps[-1][0]]

    def fit(self, X, y=None, **fit_params):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of
            the pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        self : Pipeline
            This estimator
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        if self._final_estimator is not None:
            self._final_estimator.fit(Xt, y, **fit_params)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit the model and transform with the final estimator

        Fits all the transforms one after the other and transforms the
        data, then uses fit_transform on transformed data with the final
        estimator.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of
            the pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps of
            the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        Xt : array-like, shape = [n_samples, n_transformed_features]
            Transformed samples
        """
        last_step = self._final_estimator
        Xt, fit_params = self._fit(X, y, **fit_params)
        if hasattr(last_step, 'fit_transform'):
            return last_step.fit_transform(Xt, y, **fit_params)
        elif last_step is None:
            return Xt
        else:
            return last_step.fit(Xt, y, **fit_params).transform(Xt)
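
    # A minimal sketch of fit parameter routing: a key ``s__p`` in
    # ``fit_params`` is split on the first ``'__'`` and delivered as
    # parameter ``p`` to the ``fit`` of step ``s``. Assumes imports of
    # StandardScaler and SGDClassifier and a sample-weight vector ``w``.
    #
    #     >>> pipe = Pipeline([('scale', StandardScaler()),
    #     ...                  ('sgd', SGDClassifier())])
    #     >>> pipe.fit(X, y, sgd__sample_weight=w)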

    @if_delegate_has_method(delegate='_final_estimator')
    def predict(self, X, **predict_params):
        """Apply transforms to the data, and predict with the final estimator

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        **predict_params : dict of string -> object
            Parameters to the ``predict`` called at the end of all
            transformations in the pipeline. Note that while this may be
            used to return uncertainties from some models with return_std
            or return_cov, uncertainties that are generated by the
            transformations in the pipeline are not propagated to the
            final estimator.

        Returns
        -------
        y_pred : array-like
        """
        Xt = X
        for name, transform in self.steps[:-1]:
            if transform is not None:
                Xt = transform.transform(Xt)
        return self.steps[-1][-1].predict(Xt, **predict_params)
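
    # A minimal sketch of ``predict_params``: keyword arguments are
    # forwarded unchanged to the final estimator's ``predict``, e.g.
    # ``return_std`` of GaussianProcessRegressor. Assumes the relevant
    # imports and regression data ``X, y``.
    #
    #     >>> pipe = Pipeline([('scale', StandardScaler()),
    #     ...                  ('gpr', GaussianProcessRegressor())])
    #     >>> y_mean, y_std = pipe.fit(X, y).predict(X, return_std=True)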

    @if_delegate_has_method(delegate='_final_estimator')
    def fit_predict(self, X, y=None, **fit_params):
        """Applies fit_predict of last step in pipeline after transforms.

        Applies fit_transforms of a pipeline to the data, followed by the
        fit_predict method of the final estimator in the pipeline. Valid
        only if the final estimator implements fit_predict.

        Parameters
        ----------
        X : iterable
            Training data. Must fulfill input requirements of first step of
            the pipeline.

        y : iterable, default=None
            Training targets. Must fulfill label requirements for all steps
            of the pipeline.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of each step, where
            each parameter name is prefixed such that parameter ``p`` for step
            ``s`` has key ``s__p``.

        Returns
        -------
        y_pred : array-like
        """
        Xt, fit_params = self._fit(X, y, **fit_params)
        return self.steps[-1][-1].fit_predict(Xt, y, **fit_params)

    @if_delegate_has_method(delegate='_final_estimator')
    def predict_proba(self, X):
        """Apply transforms, and predict_proba of the final estimator

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        y_proba : array-like, shape = [n_samples, n_classes]
        """
        Xt = X
        for name, transform in self.steps[:-1]:
            if transform is not None:
                Xt = transform.transform(Xt)
        return self.steps[-1][-1].predict_proba(Xt)

    @if_delegate_has_method(delegate='_final_estimator')
    def decision_function(self, X):
        """Apply transforms, and decision_function of the final estimator

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        y_score : array-like, shape = [n_samples, n_classes]
        """
        Xt = X
        for name, transform in self.steps[:-1]:
            if transform is not None:
                Xt = transform.transform(Xt)
        return self.steps[-1][-1].decision_function(Xt)

    @if_delegate_has_method(delegate='_final_estimator')
    def predict_log_proba(self, X):
        """Apply transforms, and predict_log_proba of the final estimator

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        y_score : array-like, shape = [n_samples, n_classes]
        """
        Xt = X
        for name, transform in self.steps[:-1]:
            if transform is not None:
                Xt = transform.transform(Xt)
        return self.steps[-1][-1].predict_log_proba(Xt)

    @property
    def transform(self):
        """Apply transforms, and transform with the final estimator

        This also works where final estimator is ``None``: all prior
        transformations are applied.

        Parameters
        ----------
        X : iterable
            Data to transform. Must fulfill input requirements of first step
            of the pipeline.

        Returns
        -------
        Xt : array-like, shape = [n_samples, n_transformed_features]
        """
        # _final_estimator is None or has transform, otherwise attribute error
        # XXX: Handling the None case means we can't use if_delegate_has_method
        if self._final_estimator is not None:
            self._final_estimator.transform
        return self._transform

    def _transform(self, X):
        Xt = X
        for name, transform in self.steps:
            if transform is not None:
                Xt = transform.transform(Xt)
        return Xt

    @property
    def inverse_transform(self):
        """Apply inverse transformations in reverse order

        All estimators in the pipeline must support ``inverse_transform``.

        Parameters
        ----------
        Xt : array-like, shape = [n_samples, n_transformed_features]
            Data samples, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features. Must fulfill
            input requirements of last step of pipeline's
            ``inverse_transform`` method.

        Returns
        -------
        Xt : array-like, shape = [n_samples, n_features]
        """
        # raise AttributeError if necessary for hasattr behaviour
        # XXX: Handling the None case means we can't use if_delegate_has_method
        for name, transform in self.steps:
            if transform is not None:
                transform.inverse_transform
        return self._inverse_transform

    def _inverse_transform(self, X):
        Xt = X
        for name, transform in self.steps[::-1]:
            if transform is not None:
                Xt = transform.inverse_transform(Xt)
        return Xt
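
    # A minimal round-trip sketch: every step must expose
    # ``inverse_transform``, and the inverses are applied last step first.
    # StandardScaler and PCA both qualify; imports and data are assumed.
    #
    #     >>> pipe = Pipeline([('scale', StandardScaler()),
    #     ...                  ('pca', PCA(n_components=2))])
    #     >>> Xt = pipe.fit_transform(X)
    #     >>> X_back = pipe.inverse_transform(Xt)  # lossy if PCA drops dims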

    @if_delegate_has_method(delegate='_final_estimator')
    def score(self, X, y=None, sample_weight=None):
        """Apply transforms, and score with the final estimator

        Parameters
        ----------
        X : iterable
            Data to predict on. Must fulfill input requirements of first step
            of the pipeline.

        y : iterable, default=None
            Targets used for scoring. Must fulfill label requirements for all
            steps of the pipeline.

        sample_weight : array-like, default=None
            If not None, this argument is passed as ``sample_weight`` keyword
            argument to the ``score`` method of the final estimator.

        Returns
        -------
        score : float
        """
        Xt = X
        for name, transform in self.steps[:-1]:
            if transform is not None:
                Xt = transform.transform(Xt)
        score_params = {}
        if sample_weight is not None:
            score_params['sample_weight'] = sample_weight
        return self.steps[-1][-1].score(Xt, y, **score_params)

    @property
    def classes_(self):
        return self.steps[-1][-1].classes_

    @property
    def _pairwise(self):
        # check if first estimator expects pairwise input
        return getattr(self.steps[0][1], '_pairwise', False)


def _name_estimators(estimators):
    """Generate names for estimators."""

    names = [type(estimator).__name__.lower() for estimator in estimators]
    namecount = defaultdict(int)
    for est, name in zip(estimators, names):
        namecount[name] += 1

    for k, v in list(six.iteritems(namecount)):
        if v == 1:
            del namecount[k]

    for i in reversed(range(len(estimators))):
        name = names[i]
        if name in namecount:
            names[i] += "-%d" % namecount[name]
            namecount[name] -= 1

    return list(zip(names, estimators))
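
# A minimal sketch of the naming scheme: names are the lowercased class
# names, and duplicates are disambiguated with a numeric suffix assigned
# from the end of the list (output abbreviated).
#
#     >>> from sklearn.decomposition import PCA
#     >>> _name_estimators([PCA(), PCA()])
#     [('pca-1', PCA(...)), ('pca-2', PCA(...))]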


def make_pipeline(*steps, **kwargs):
    """Construct a Pipeline from the given estimators.

    This is a shorthand for the Pipeline constructor; it does not require, and
    does not permit, naming the estimators. Instead, their names will be set
    to the lowercase of their types automatically.

    Parameters
    ----------
    *steps : list of estimators.

    memory : None, str or object with the joblib.Memory interface, optional
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.

    See also
    --------
    sklearn.pipeline.Pipeline : Class for creating a pipeline of
        transforms with a final estimator.

    Examples
    --------
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.preprocessing import StandardScaler
    >>> make_pipeline(StandardScaler(), GaussianNB(priors=None))
    ...     # doctest: +NORMALIZE_WHITESPACE
    Pipeline(memory=None,
             steps=[('standardscaler',
                     StandardScaler(copy=True, with_mean=True, with_std=True)),
                    ('gaussiannb',
                     GaussianNB(priors=None, var_smoothing=1e-09))])

    Returns
    -------
    p : Pipeline
    """
    memory = kwargs.pop('memory', None)
    if kwargs:
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return Pipeline(_name_estimators(steps), memory=memory)
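
# A minimal sketch of the keyword handling: ``memory`` is the only accepted
# keyword. It is popped from ``**kwargs`` (a Python 2 compatible stand-in
# for a keyword-only argument) and anything left over raises. The cache
# path and the stray ``compose`` keyword below are illustrative only.
#
#     >>> make_pipeline(StandardScaler(), GaussianNB(), memory='/tmp/cache')
#     ... # doctest: +SKIP
#     >>> make_pipeline(StandardScaler(), GaussianNB(), compose=True)
#     Traceback (most recent call last):
#         ...
#     TypeError: Unknown keyword arguments: "compose"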


# weight and fit_params are not used, but they allow _fit_one_transformer,
# _transform_one and _fit_transform_one to share the same signature, which
# factorizes the code in ColumnTransformer
def _fit_one_transformer(transformer, X, y, weight=None, **fit_params):
    return transformer.fit(X, y)


def _transform_one(transformer, X, y, weight, **fit_params):
    res = transformer.transform(X)
    # if we have a weight for this transformer, multiply output
    if weight is None:
        return res
    return res * weight


def _fit_transform_one(transformer, X, y, weight, **fit_params):
    if hasattr(transformer, 'fit_transform'):
        res = transformer.fit_transform(X, y, **fit_params)
    else:
        res = transformer.fit(X, y, **fit_params).transform(X)
    # if we have a weight for this transformer, multiply output
    if weight is None:
        return res, transformer
    return res * weight, transformer


class FeatureUnion(_BaseComposition, TransformerMixin):
    """Concatenates results of multiple transformer objects.

    This estimator applies a list of transformer objects in parallel to the
    input data, then concatenates the results. This is useful to combine
    several feature extraction mechanisms into a single transformer.

    Parameters of the transformers may be set using their names and the
    parameter name separated by a '__'. A transformer may be replaced
    entirely by setting the parameter with its name to another transformer,
    or removed by setting to 'drop' or ``None``.

    Read more in the :ref:`User Guide <feature_union>`.

    Parameters
    ----------
    transformer_list : list of (string, transformer) tuples
        List of transformer objects to be applied to the data. The first
        half of each tuple is the name of the transformer.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    transformer_weights : dict, optional
        Multiplicative weights for features per transformer.
        Keys are transformer names, values the weights.

    See also
    --------
    sklearn.pipeline.make_union : convenience function for simplified
        feature union construction.

    Examples
    --------
    >>> from sklearn.pipeline import FeatureUnion
    >>> from sklearn.decomposition import PCA, TruncatedSVD
    >>> union = FeatureUnion([("pca", PCA(n_components=1)),
    ...                       ("svd", TruncatedSVD(n_components=2))])
    >>> X = [[0., 1., 3], [2., 2., 5]]
    >>> union.fit_transform(X)    # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    array([[ 1.5       ,  3.0...,  0.8...],
           [-1.5       ,  5.7..., -0.4...]])
    """

    def __init__(self, transformer_list, n_jobs=None,
                 transformer_weights=None):
        self.transformer_list = transformer_list
        self.n_jobs = n_jobs
        self.transformer_weights = transformer_weights
        self._validate_transformers()

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """
        return self._get_params('transformer_list', deep=deep)

    def set_params(self, **kwargs):
        """Set the parameters of this estimator.

        Valid parameter keys can be listed with ``get_params()``.

        Returns
        -------
        self
        """
        self._set_params('transformer_list', **kwargs)
        return self

    def _validate_transformers(self):
        names, transformers = zip(*self.transformer_list)

        # validate names
        self._validate_names(names)

        # validate estimators
        for t in transformers:
            if t is None or t == 'drop':
                continue
            if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
                    hasattr(t, "transform")):
                raise TypeError("All estimators should implement fit and "
                                "transform. '%s' (type %s) doesn't" %
                                (t, type(t)))

    def _iter(self):
        """
        Generate (name, trans, weight) tuples excluding None and
        'drop' transformers.
        """
        get_weight = (self.transformer_weights or {}).get
        return ((name, trans, get_weight(name))
                for name, trans in self.transformer_list
                if trans is not None and trans != 'drop')

    def get_feature_names(self):
        """Get feature names from all transformers.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """
        feature_names = []
        for name, trans, weight in self._iter():
            if not hasattr(trans, 'get_feature_names'):
                raise AttributeError("Transformer %s (type %s) does not "
                                     "provide get_feature_names."
                                     % (str(name), type(trans).__name__))
            feature_names.extend([name + "__" + f for f in
                                  trans.get_feature_names()])
        return feature_names
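
    # A minimal sketch: feature names are each transformer's own names
    # prefixed with the step name and ``'__'``. Assumes two CountVectorizer
    # steps fitted on a corpus ``docs`` (output abbreviated).
    #
    #     >>> union = FeatureUnion([('word', CountVectorizer()),
    #     ...                       ('char', CountVectorizer(analyzer='char'))])
    #     >>> union.fit(docs).get_feature_names()
    #     ['word__apple', ..., 'char__a', ...]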

    def fit(self, X, y=None):
        """Fit all transformers using X.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data, used to fit transformers.

        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : FeatureUnion
            This estimator
        """
        self.transformer_list = list(self.transformer_list)
        self._validate_transformers()
        transformers = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_one_transformer)(trans, X, y)
            for _, trans, _ in self._iter())
        self._update_transformer_list(transformers)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and concatenate results.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, X, y, weight,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        return Xs

    def transform(self, X):
        """Transform X separately by each transformer, concatenate results.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        Returns
        -------
        X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
            hstack of results of transformers. sum_n_components is the
            sum of n_components (output dimension) over transformers.
        """
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = np.hstack(Xs)
        return Xs
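
    # A minimal sketch of transformer weighting: each block of output
    # columns is multiplied by the weight registered under the transformer's
    # name before concatenation. Assumes the PCA/SVD union and the data
    # from the class docstring above.
    #
    #     >>> union = FeatureUnion([('pca', PCA(n_components=1)),
    #     ...                       ('svd', TruncatedSVD(n_components=2))],
    #     ...                      transformer_weights={'pca': 10.0})
    #     >>> union.fit_transform(X)   # the PCA column is scaled by 10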

    def _update_transformer_list(self, transformers):
        transformers = iter(transformers)
        self.transformer_list[:] = [(name, old if old is None or old == 'drop'
                                     else next(transformers))
                                    for name, old in self.transformer_list]


def make_union(*transformers, **kwargs):
    """Construct a FeatureUnion from the given transformers.

    This is a shorthand for the FeatureUnion constructor; it does not require,
    and does not permit, naming the transformers. Instead, they will be given
    names automatically based on their types. It also does not allow weighting.

    Parameters
    ----------
    *transformers : list of estimators

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    f : FeatureUnion

    See also
    --------
    sklearn.pipeline.FeatureUnion : Class for concatenating the results
        of multiple transformer objects.

    Examples
    --------
    >>> from sklearn.decomposition import PCA, TruncatedSVD
    >>> from sklearn.pipeline import make_union
    >>> make_union(PCA(), TruncatedSVD())    # doctest: +NORMALIZE_WHITESPACE
    FeatureUnion(n_jobs=None,
           transformer_list=[('pca',
                              PCA(copy=True, iterated_power='auto',
                                  n_components=None, random_state=None,
                                  svd_solver='auto', tol=0.0, whiten=False)),
                             ('truncatedsvd',
                              TruncatedSVD(algorithm='randomized',
                                           n_components=2, n_iter=5,
                                           random_state=None, tol=0.0))],
           transformer_weights=None)
    """
    n_jobs = kwargs.pop('n_jobs', None)
    if kwargs:
        # We do not currently support `transformer_weights` as we may want to
        # change its type spec in make_union
        raise TypeError('Unknown keyword arguments: "{}"'
                        .format(list(kwargs.keys())[0]))
    return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs)
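
# A minimal sketch of the keyword handling: like make_pipeline, make_union
# accepts a single keyword, ``n_jobs``. Passing ``transformer_weights``
# raises; use the FeatureUnion constructor for weighting instead.
#
#     >>> make_union(PCA(), TruncatedSVD(), n_jobs=2)  # doctest: +SKIP
#     >>> make_union(PCA(), transformer_weights={'pca': 2})
#     Traceback (most recent call last):
#         ...
#     TypeError: Unknown keyword arguments: "transformer_weights"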