1411 lines
52 KiB
Python
1411 lines
52 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# Authors: Olivier Grisel <olivier.grisel@ensta.org>
|
||
|
# Mathieu Blondel <mathieu@mblondel.org>
|
||
|
# Lars Buitinck
|
||
|
# Robert Layton <robertlayton@gmail.com>
|
||
|
# Jochen Wersdörfer <jochen@wersdoerfer.de>
|
||
|
# Roman Sinayev <roman.sinayev@gmail.com>
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
"""
|
||
|
The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
|
||
|
build feature vectors from text documents.
|
||
|
"""
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import array
|
||
|
from collections import Mapping, defaultdict
|
||
|
import numbers
|
||
|
from operator import itemgetter
|
||
|
import re
|
||
|
import unicodedata
|
||
|
|
||
|
import numpy as np
|
||
|
import scipy.sparse as sp
|
||
|
|
||
|
from ..base import BaseEstimator, TransformerMixin
|
||
|
from ..externals import six
|
||
|
from ..externals.six.moves import xrange
|
||
|
from ..preprocessing import normalize
|
||
|
from .hashing import FeatureHasher
|
||
|
from .stop_words import ENGLISH_STOP_WORDS
|
||
|
from ..utils.validation import check_is_fitted
|
||
|
|
||
|
__all__ = ['CountVectorizer',
|
||
|
'ENGLISH_STOP_WORDS',
|
||
|
'TfidfTransformer',
|
||
|
'TfidfVectorizer',
|
||
|
'strip_accents_ascii',
|
||
|
'strip_accents_unicode',
|
||
|
'strip_tags']
|
||
|
|
||
|
|
||
|
def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : unicode string
        The string to strip.

    Returns
    -------
    unicode string
        ``s`` decomposed with NFKD normalization and with every combining
        character removed. Pure-ASCII input is returned unchanged.

    See also
    --------
    strip_accents_ascii
        Remove accentuated char for any unicode symbol that has a direct
        ASCII equivalent.
    """
    try:
        # Fast path: a string made only of ASCII characters cannot carry
        # accents, so the python-level filtering loop can be skipped.
        s.encode('ASCII', 'strict')
    except UnicodeEncodeError:
        pass
    else:
        return s

    # Comparing the NFKD form with the input is NOT a valid shortcut: input
    # that is already decomposed (e.g. "n" followed by U+0303) equals its own
    # normal form while still containing combining marks.
    normalized = unicodedata.normalize('NFKD', s)
    return ''.join([c for c in normalized if not unicodedata.combining(c)])
|
||
|
|
||
|
|
||
|
def strip_accents_ascii(s):
    """Transform accentuated unicode symbols into ascii or nothing

    The string is decomposed with NFKD normalization and every character
    that cannot be encoded as ASCII is then silently dropped.

    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.

    See also
    --------
    strip_accents_unicode
        Remove accentuated char for any unicode symbol.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    # 'ignore' discards whatever has no ASCII representation (combining
    # marks included) instead of raising.
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')
|
||
|
|
||
|
|
||
|
def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function

    Every ``<...>`` span containing at least one character is replaced by a
    single space.

    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.
    """
    return re.sub(r"<([^>]+)>", " ", s, flags=re.UNICODE)
|
||
|
|
||
|
|
||
|
def _check_stop_list(stop):
    """Resolve a ``stop_words`` parameter value into a usable stop list.

    Returns ``ENGLISH_STOP_WORDS`` for the string "english", ``None`` for
    ``None`` and a ``frozenset`` built from any other non-string value.

    Raises ValueError for any string other than "english".
    """
    if stop == "english":
        return ENGLISH_STOP_WORDS
    if isinstance(stop, six.string_types):
        raise ValueError("not a built-in stop list: %s" % stop)
    if stop is None:
        return None
    # assume it's a collection
    return frozenset(stop)
|
||
|
|
||
|
|
||
|
class VectorizerMixin(object):
    """Provides common code for text vectorizers (tokenization logic).

    The methods read the following attributes from the host object:
    ``input``, ``encoding``, ``decode_error``, ``strip_accents``,
    ``lowercase``, ``preprocessor``, ``tokenizer``, ``token_pattern``,
    ``stop_words``, ``ngram_range``, ``analyzer`` and ``vocabulary``.
    """

    # Matches runs of two or more whitespace characters
    _white_spaces = re.compile(r"\s\s+")

    def decode(self, doc):
        """Decode the input into a string of unicode symbols

        The decoding strategy depends on the vectorizer parameters: with
        ``input == 'filename'`` the file named ``doc`` is read in binary
        mode, with ``input == 'file'`` ``doc.read()`` is called, otherwise
        ``doc`` is used as is. Bytes are then decoded with ``self.encoding``
        and ``self.decode_error``.

        Raises ValueError if the document is ``np.nan``.
        """
        if self.input == 'filename':
            with open(doc, 'rb') as fh:
                doc = fh.read()

        elif self.input == 'file':
            doc = doc.read()

        if isinstance(doc, bytes):
            doc = doc.decode(self.encoding, self.decode_error)

        if doc is np.nan:
            raise ValueError("np.nan is an invalid document, expected byte or "
                             "unicode string.")

        return doc

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering

        n-grams are the space-joined runs of ``n`` consecutive tokens for
        every ``n`` in ``self.ngram_range`` (bounds included), emitted by
        increasing ``n``.
        """
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            # n is capped so that it never exceeds the number of tokens
            for n in xrange(min_n,
                            min(max_n + 1, n_original_tokens + 1)):
                for i in xrange(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i: i + n]))

        return tokens

    def _char_ngrams(self, text_document):
        """Tokenize text_document into a sequence of character n-grams"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        text_len = len(text_document)
        min_n, max_n = self.ngram_range
        if min_n == 1:
            # no need to do any slicing for unigrams
            # iterate through the string
            ngrams = list(text_document)
            min_n += 1
        else:
            ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for n in xrange(min_n, min(max_n + 1, text_len + 1)):
            for i in xrange(text_len - n + 1):
                ngrams_append(text_document[i: i + n])
        return ngrams

    def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        operating only inside word boundaries. n-grams at the edges
        of words are padded with space."""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for w in text_document.split():
            w = ' ' + w + ' '
            w_len = len(w)
            for n in xrange(min_n, max_n + 1):
                offset = 0
                ngrams_append(w[offset:offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams_append(w[offset:offset + n])
                if offset == 0:   # count a short word (w_len < n) only once
                    break
        return ngrams

    def build_preprocessor(self):
        """Return a function to preprocess the text before tokenization

        ``self.preprocessor`` is returned as is when set. Otherwise the
        returned callable lowercases its input (if ``self.lowercase``) and
        then applies the accent stripping selected by ``self.strip_accents``.

        Raises ValueError for an unknown ``strip_accents`` value.
        """
        if self.preprocessor is not None:
            return self.preprocessor

        # unfortunately python functools package does not have an efficient
        # `compose` function that would have allowed us to chain a dynamic
        # number of functions. However the cost of a lambda call is a few
        # hundreds of nanoseconds which is negligible when compared to the
        # cost of tokenizing a string of 1000 chars for instance.
        noop = lambda x: x

        # accent stripping
        if not self.strip_accents:
            strip_accents = noop
        elif callable(self.strip_accents):
            strip_accents = self.strip_accents
        elif self.strip_accents == 'ascii':
            strip_accents = strip_accents_ascii
        elif self.strip_accents == 'unicode':
            strip_accents = strip_accents_unicode
        else:
            raise ValueError('Invalid value for "strip_accents": %s' %
                             self.strip_accents)

        if self.lowercase:
            return lambda x: strip_accents(x.lower())
        else:
            return strip_accents

    def build_tokenizer(self):
        """Return a function that splits a string into a sequence of tokens

        ``self.tokenizer`` is returned as is when set; otherwise the
        callable returns every match of ``self.token_pattern``.
        """
        if self.tokenizer is not None:
            return self.tokenizer
        token_pattern = re.compile(self.token_pattern)
        return lambda doc: token_pattern.findall(doc)

    def get_stop_words(self):
        """Build or fetch the effective stop words list"""
        return _check_stop_list(self.stop_words)

    def build_analyzer(self):
        """Return a callable that handles preprocessing and tokenization

        A callable ``self.analyzer`` is returned untouched. For 'char',
        'char_wb' and 'word' the returned callable decodes the document,
        preprocesses it and extracts the matching kind of n-grams.

        Raises ValueError for any other ``analyzer`` value.
        """
        if callable(self.analyzer):
            return self.analyzer

        preprocess = self.build_preprocessor()

        if self.analyzer == 'char':
            return lambda doc: self._char_ngrams(preprocess(self.decode(doc)))

        elif self.analyzer == 'char_wb':
            return lambda doc: self._char_wb_ngrams(
                preprocess(self.decode(doc)))

        elif self.analyzer == 'word':
            stop_words = self.get_stop_words()
            tokenize = self.build_tokenizer()

            return lambda doc: self._word_ngrams(
                tokenize(preprocess(self.decode(doc))), stop_words)

        else:
            raise ValueError('%s is not a valid tokenization scheme/analyzer' %
                             self.analyzer)

    def _validate_vocabulary(self):
        """Validate ``self.vocabulary`` and set ``fixed_vocabulary_``.

        When a vocabulary was supplied it is turned into a term -> index
        dict stored in ``self.vocabulary_`` and ``self.fixed_vocabulary_``
        is set to True; otherwise ``self.fixed_vocabulary_`` is set to False.

        Raises ValueError for duplicate terms, repeated or missing indices,
        or an empty vocabulary.
        """
        vocabulary = self.vocabulary
        if vocabulary is not None:
            if isinstance(vocabulary, set):
                # sets have no order: sort to get reproducible indices
                vocabulary = sorted(vocabulary)
            if not isinstance(vocabulary, Mapping):
                vocab = {}
                for i, t in enumerate(vocabulary):
                    # setdefault returns the first index stored for t, which
                    # differs from i exactly when t was already seen
                    if vocab.setdefault(t, i) != i:
                        msg = "Duplicate term in vocabulary: %r" % t
                        raise ValueError(msg)
                vocabulary = vocab
            else:
                indices = set(six.itervalues(vocabulary))
                if len(indices) != len(vocabulary):
                    raise ValueError("Vocabulary contains repeated indices.")
                for i in xrange(len(vocabulary)):
                    if i not in indices:
                        msg = ("Vocabulary of size %d doesn't contain index "
                               "%d." % (len(vocabulary), i))
                        raise ValueError(msg)
            if not vocabulary:
                raise ValueError("empty vocabulary passed to fit")
            self.fixed_vocabulary_ = True
            self.vocabulary_ = dict(vocabulary)
        else:
            self.fixed_vocabulary_ = False

    def _check_vocabulary(self):
        """Check if vocabulary is empty or missing (not fit-ed)

        Raises ValueError if ``self.vocabulary_`` is empty; a missing
        ``vocabulary_`` attribute is reported by ``check_is_fitted``.
        """
        msg = "%(name)s - Vocabulary wasn't fitted."
        check_is_fitted(self, 'vocabulary_', msg=msg)

        if len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary is empty")
|
||
|
|
||
|
|
||
|
class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
    """Convert a collection of text documents to a matrix of token occurrences

    It turns a collection of text documents into a scipy.sparse matrix holding
    token occurrence counts (or binary occurrence information), possibly
    normalized as token frequencies if norm='l1' or projected on the euclidean
    unit sphere if norm='l2'.

    This text vectorizer implementation uses the hashing trick to find the
    token string name to feature integer index mapping.

    This strategy has several advantages:

    - it uses very little memory and scales to large datasets, as there is
      no need to store a vocabulary dictionary in memory

    - it is fast to pickle and un-pickle as it holds no state besides the
      constructor parameters

    - it can be used in a streaming (partial fit) or parallel pipeline as there
      is no state computed during fit.

    There are also a couple of cons (vs using a CountVectorizer with an
    in-memory vocabulary):

    - there is no way to compute the inverse transform (from feature indices to
      string feature names) which can be a problem when trying to introspect
      which features are most important to a model.

    - there can be collisions: distinct tokens can be mapped to the same
      feature index. However in practice this is rarely an issue if n_features
      is large enough (e.g. 2 ** 18 for text classification problems).

    - no IDF weighting as this would render the transformer stateful.

    The hash function employed is the signed 32-bit version of Murmurhash3.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------

    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of strings or
        bytes items that are analyzed directly.

    encoding : string, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

    analyzer : string, {'word', 'char', 'char_wb'} or callable
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    stop_words : string {'english'}, list, or None (default)
        If 'english', a built-in stop word list for English is used.

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

    lowercase : boolean, default=True
        Convert all characters to lowercase before tokenizing.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    n_features : integer, default=(2 ** 20)
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.

    norm : 'l1', 'l2' or None, optional, default='l2'
        Norm used to normalize term vectors. None for no normalization.

    binary : boolean, default=False.
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    alternate_sign : boolean, optional, default True
        When True, an alternating sign is added to the features as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionadded:: 0.19

    non_negative : boolean, optional, default False
        When True, an absolute value is applied to the features matrix prior to
        returning it. When used in conjunction with alternate_sign=True, this
        significantly reduces the inner product preservation property.

        .. deprecated:: 0.19
            This option will be removed in 0.21.

    See also
    --------
    CountVectorizer, TfidfVectorizer

    """
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
                 binary=False, norm='l2', alternate_sign=True,
                 non_negative=False, dtype=np.float64):
        # Parameters are stored verbatim, in signature order.
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.lowercase = lowercase
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.token_pattern = token_pattern
        self.ngram_range = ngram_range
        self.analyzer = analyzer
        self.n_features = n_features
        self.binary = binary
        self.norm = norm
        self.alternate_sign = alternate_sign
        self.non_negative = non_negative
        self.dtype = dtype

    def partial_fit(self, X, y=None):
        """Does nothing: this transformer is stateless.

        This method is just there to mark the fact that this transformer
        can work in a streaming setup.

        """
        return self

    def fit(self, X, y=None):
        """Does nothing: this transformer is stateless.

        Raises ValueError if ``X`` is a single string rather than an
        iterable of documents.
        """
        if isinstance(X, six.string_types):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")

        # Fitting a throw-away hasher triggers a parameter validation
        hasher = self._get_hasher()
        hasher.fit(X, y=y)
        return self

    def transform(self, X):
        """Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : scipy.sparse matrix, shape = (n_samples, self.n_features)
            Document-term matrix.
        """
        if isinstance(X, six.string_types):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received.")

        analyze = self.build_analyzer()
        hasher = self._get_hasher()
        # Documents are analyzed lazily, one at a time, as the hasher
        # consumes the generator.
        hashed = hasher.transform(analyze(doc) for doc in X)

        if self.binary:
            hashed.data.fill(1)
        if self.norm is None:
            return hashed
        return normalize(hashed, norm=self.norm, copy=False)

    def _get_hasher(self):
        """Build a FeatureHasher configured from this vectorizer's parameters.
        """
        hasher_params = dict(n_features=self.n_features,
                             input_type='string',
                             dtype=self.dtype,
                             alternate_sign=self.alternate_sign,
                             non_negative=self.non_negative)
        return FeatureHasher(**hasher_params)
|
||
|
|
||
|
|
||
|
def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X.

    Returns a 1-d integer array with one entry per column of ``X``.
    """
    if not sp.isspmatrix_csr(X):
        # In CSC layout the gap between consecutive indptr entries is the
        # number of stored values of each column.
        as_csc = sp.csc_matrix(X, copy=False)
        return np.diff(as_csc.indptr)
    # In CSR layout ``indices`` holds the column of every stored value.
    n_columns = X.shape[1]
    return np.bincount(X.indices, minlength=n_columns)
|
||
|
|
||
|
|
||
|
class CountVectorizer(BaseEstimator, VectorizerMixin):
|
||
|
"""Convert a collection of text documents to a matrix of token counts
|
||
|
|
||
|
This implementation produces a sparse representation of the counts using
|
||
|
scipy.sparse.csr_matrix.
|
||
|
|
||
|
If you do not provide an a-priori dictionary and you do not use an analyzer
|
||
|
that does some kind of feature selection then the number of features will
|
||
|
be equal to the vocabulary size found by analyzing the data.
|
||
|
|
||
|
Read more in the :ref:`User Guide <text_feature_extraction>`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
input : string {'filename', 'file', 'content'}
|
||
|
If 'filename', the sequence passed as an argument to fit is
|
||
|
expected to be a list of filenames that need reading to fetch
|
||
|
the raw content to analyze.
|
||
|
|
||
|
If 'file', the sequence items must have a 'read' method (file-like
|
||
|
object) that is called to fetch the bytes in memory.
|
||
|
|
||
|
Otherwise the input is expected to be the sequence strings or
|
||
|
bytes items are expected to be analyzed directly.
|
||
|
|
||
|
encoding : string, 'utf-8' by default.
|
||
|
If bytes or files are given to analyze, this encoding is used to
|
||
|
decode.
|
||
|
|
||
|
decode_error : {'strict', 'ignore', 'replace'}
|
||
|
Instruction on what to do if a byte sequence is given to analyze that
|
||
|
contains characters not of the given `encoding`. By default, it is
|
||
|
'strict', meaning that a UnicodeDecodeError will be raised. Other
|
||
|
values are 'ignore' and 'replace'.
|
||
|
|
||
|
strip_accents : {'ascii', 'unicode', None}
|
||
|
Remove accents during the preprocessing step.
|
||
|
'ascii' is a fast method that only works on characters that have
|
||
|
an direct ASCII mapping.
|
||
|
'unicode' is a slightly slower method that works on any characters.
|
||
|
None (default) does nothing.
|
||
|
|
||
|
analyzer : string, {'word', 'char', 'char_wb'} or callable
|
||
|
Whether the feature should be made of word or character n-grams.
|
||
|
Option 'char_wb' creates character n-grams only from text inside
|
||
|
word boundaries; n-grams at the edges of words are padded with space.
|
||
|
|
||
|
If a callable is passed it is used to extract the sequence of features
|
||
|
out of the raw, unprocessed input.
|
||
|
|
||
|
preprocessor : callable or None (default)
|
||
|
Override the preprocessing (string transformation) stage while
|
||
|
preserving the tokenizing and n-grams generation steps.
|
||
|
|
||
|
tokenizer : callable or None (default)
|
||
|
Override the string tokenization step while preserving the
|
||
|
preprocessing and n-grams generation steps.
|
||
|
Only applies if ``analyzer == 'word'``.
|
||
|
|
||
|
ngram_range : tuple (min_n, max_n)
|
||
|
The lower and upper boundary of the range of n-values for different
|
||
|
n-grams to be extracted. All values of n such that min_n <= n <= max_n
|
||
|
will be used.
|
||
|
|
||
|
stop_words : string {'english'}, list, or None (default)
|
||
|
If 'english', a built-in stop word list for English is used.
|
||
|
|
||
|
If a list, that list is assumed to contain stop words, all of which
|
||
|
will be removed from the resulting tokens.
|
||
|
Only applies if ``analyzer == 'word'``.
|
||
|
|
||
|
If None, no stop words will be used. max_df can be set to a value
|
||
|
in the range [0.7, 1.0) to automatically detect and filter stop
|
||
|
words based on intra corpus document frequency of terms.
|
||
|
|
||
|
lowercase : boolean, True by default
|
||
|
Convert all characters to lowercase before tokenizing.
|
||
|
|
||
|
token_pattern : string
|
||
|
Regular expression denoting what constitutes a "token", only used
|
||
|
if ``analyzer == 'word'``. The default regexp select tokens of 2
|
||
|
or more alphanumeric characters (punctuation is completely ignored
|
||
|
and always treated as a token separator).
|
||
|
|
||
|
max_df : float in range [0.0, 1.0] or int, default=1.0
|
||
|
When building the vocabulary ignore terms that have a document
|
||
|
frequency strictly higher than the given threshold (corpus-specific
|
||
|
stop words).
|
||
|
If float, the parameter represents a proportion of documents, integer
|
||
|
absolute counts.
|
||
|
This parameter is ignored if vocabulary is not None.
|
||
|
|
||
|
min_df : float in range [0.0, 1.0] or int, default=1
|
||
|
When building the vocabulary ignore terms that have a document
|
||
|
frequency strictly lower than the given threshold. This value is also
|
||
|
called cut-off in the literature.
|
||
|
If float, the parameter represents a proportion of documents, integer
|
||
|
absolute counts.
|
||
|
This parameter is ignored if vocabulary is not None.
|
||
|
|
||
|
max_features : int or None, default=None
|
||
|
If not None, build a vocabulary that only consider the top
|
||
|
max_features ordered by term frequency across the corpus.
|
||
|
|
||
|
This parameter is ignored if vocabulary is not None.
|
||
|
|
||
|
vocabulary : Mapping or iterable, optional
|
||
|
Either a Mapping (e.g., a dict) where keys are terms and values are
|
||
|
indices in the feature matrix, or an iterable over terms. If not
|
||
|
given, a vocabulary is determined from the input documents. Indices
|
||
|
in the mapping should not be repeated and should not have any gap
|
||
|
between 0 and the largest index.
|
||
|
|
||
|
binary : boolean, default=False
|
||
|
If True, all non zero counts are set to 1. This is useful for discrete
|
||
|
probabilistic models that model binary events rather than integer
|
||
|
counts.
|
||
|
|
||
|
dtype : type, optional
|
||
|
Type of the matrix returned by fit_transform() or transform().
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
vocabulary_ : dict
|
||
|
A mapping of terms to feature indices.
|
||
|
|
||
|
stop_words_ : set
|
||
|
Terms that were ignored because they either:
|
||
|
|
||
|
- occurred in too many documents (`max_df`)
|
||
|
- occurred in too few documents (`min_df`)
|
||
|
- were cut off by feature selection (`max_features`).
|
||
|
|
||
|
This is only available if no vocabulary was given.
|
||
|
|
||
|
See also
|
||
|
--------
|
||
|
HashingVectorizer, TfidfVectorizer
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The ``stop_words_`` attribute can get large and increase the model size
|
||
|
when pickling. This attribute is provided only for introspection and can
|
||
|
be safely removed using delattr or set to None before pickling.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, input='content', encoding='utf-8',
             decode_error='strict', strip_accents=None,
             lowercase=True, preprocessor=None, tokenizer=None,
             stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
             ngram_range=(1, 1), analyzer='word',
             max_df=1.0, min_df=1, max_features=None,
             vocabulary=None, binary=False, dtype=np.int64):
    """Store the parameters and validate the pruning thresholds.

    Raises ValueError if ``max_df`` or ``min_df`` is negative, or if
    ``max_features`` is neither None nor a positive integer.
    """
    # Decoding / preprocessing / tokenization parameters
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    self.strip_accents = strip_accents
    self.lowercase = lowercase
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.stop_words = stop_words
    self.token_pattern = token_pattern
    self.analyzer = analyzer

    # Document-frequency thresholds
    self.max_df = max_df
    self.min_df = min_df
    if any(threshold < 0 for threshold in (max_df, min_df)):
        raise ValueError("negative value for max_df or min_df")

    self.max_features = max_features
    max_features_ok = max_features is None or (
        isinstance(max_features, numbers.Integral) and max_features > 0)
    if not max_features_ok:
        raise ValueError(
            "max_features=%r, neither a positive integer nor None"
            % max_features)

    self.ngram_range = ngram_range
    self.vocabulary = vocabulary
    self.binary = binary
    self.dtype = dtype
|
||
|
|
||
|
def _sort_features(self, X, vocabulary):
    """Reorder the features alphabetically by term.

    The column indices of ``X`` are rewritten in place and ``vocabulary``
    is updated in place with the new term -> index mapping. ``X`` is
    returned.
    """
    ordered_terms = sorted(vocabulary.items())
    # remap[old_index] == new_index
    remap = np.empty(len(ordered_terms), dtype=np.int32)
    new_position = 0
    for term, previous_position in ordered_terms:
        remap[previous_position] = new_position
        vocabulary[term] = new_position
        new_position += 1

    X.indices = remap.take(X.indices, mode='clip')
    return X
|
||
|
|
||
|
def _limit_features(self, X, vocabulary, high=None, low=None,
                    limit=None):
    """Remove too rare or too common features.

    Prune features that are non zero in more samples than high or less
    documents than low, modifying the vocabulary, and restricting it to
    at most the limit most frequent.

    This does not prune samples with zero features.

    Parameters
    ----------
    X : sparse matrix
        Document-term matrix whose columns are indexed by ``vocabulary``.
    vocabulary : dict
        Term -> column index mapping; modified in place.
    high, low : number or None
        Inclusive upper / lower bounds on the document frequency.
    limit : int or None
        Maximum number of features to keep, ranked by total term count.

    Returns
    -------
    (X, removed_terms) : the column-restricted matrix and the set of terms
        dropped from ``vocabulary``.

    Raises ValueError if no term survives the pruning.
    """
    if high is None and low is None and limit is None:
        return X, set()

    # Calculate a mask based on document frequencies
    dfs = _document_frequency(X)
    mask = np.ones(len(dfs), dtype=bool)
    if high is not None:
        mask &= dfs <= high
    if low is not None:
        mask &= dfs >= low
    if limit is not None and mask.sum() > limit:
        # Term frequencies are only needed to rank the surviving features,
        # so the column sums are computed only when truncation happens.
        tfs = np.asarray(X.sum(axis=0)).ravel()
        mask_inds = (-tfs[mask]).argsort()[:limit]
        new_mask = np.zeros(len(dfs), dtype=bool)
        new_mask[np.where(mask)[0][mask_inds]] = True
        mask = new_mask

    new_indices = np.cumsum(mask) - 1  # maps old indices to new
    removed_terms = set()
    # iterate over a copy: entries are deleted from vocabulary in the loop
    for term, old_index in list(six.iteritems(vocabulary)):
        if mask[old_index]:
            vocabulary[term] = new_indices[old_index]
        else:
            del vocabulary[term]
            removed_terms.add(term)
    kept_indices = np.where(mask)[0]
    if len(kept_indices) == 0:
        raise ValueError("After pruning, no terms remain. Try a lower"
                         " min_df or a higher max_df.")
    return X[:, kept_indices], removed_terms
|
||
|
|
||
|
def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False

    Returns the ``(vocabulary, X)`` pair where ``X`` is a CSR matrix with
    one row per document and one column per vocabulary entry.

    Raises ValueError if a vocabulary had to be learned and no term was
    found.
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Looking up an unseen term inserts it with the current size of
        # the mapping, i.e. the next free feature index.
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        counts = {}
        for feature in analyze(doc):
            try:
                idx = vocabulary[feature]
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
            counts[idx] = counts.get(idx, 0) + 1

        j_indices.extend(counts.keys())
        values.extend(counts.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.frombuffer(values, dtype=np.intc)

    n_rows = len(indptr) - 1
    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(n_rows, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X
|
||
|
|
||
|
def fit(self, raw_documents, y=None):
    """Learn a vocabulary dictionary of all tokens in the raw documents.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    y : ignored
        Not used by this method.

    Returns
    -------
    self
    """
    # Fitting is delegated to fit_transform; its result is discarded.
    self.fit_transform(raw_documents)
    return self
|
||
|
|
||
|
def fit_transform(self, raw_documents, y=None):
    """Fit the vocabulary and build the document-term matrix in one pass.

    Gives the same result as calling fit and then transform, but the
    documents are only analyzed once.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    y : ignored
        Accepted but not used by this method.

    Returns
    -------
    X : sparse matrix, [n_samples, n_features]
        Document-term matrix.

    Raises
    ------
    ValueError
        If ``raw_documents`` is a single string rather than an iterable of
        documents, or if ``max_df`` resolves to fewer documents than
        ``min_df``.
    """
    # NOTE: transform() is deliberately not called from here, so that
    # TfidfVectorizer can override fit_transform without unwanted side
    # effects.
    if isinstance(raw_documents, six.string_types):
        raise ValueError(
            "Iterable over raw text documents expected, "
            "string object received.")

    self._validate_vocabulary()
    max_df, min_df, max_features = (self.max_df, self.min_df,
                                    self.max_features)

    vocabulary, X = self._count_vocab(raw_documents,
                                      self.fixed_vocabulary_)

    if self.binary:
        # Collapse every stored count to a 0/1 occurrence flag.
        X.data.fill(1)

    if self.fixed_vocabulary_:
        # A user-supplied vocabulary is never reordered or pruned.
        return X

    X = self._sort_features(X, vocabulary)

    # Integral thresholds are absolute document counts; anything else is
    # read as a proportion of the corpus size.
    n_doc = X.shape[0]
    if isinstance(max_df, numbers.Integral):
        max_doc_count = max_df
    else:
        max_doc_count = max_df * n_doc
    if isinstance(min_df, numbers.Integral):
        min_doc_count = min_df
    else:
        min_doc_count = min_df * n_doc

    if max_doc_count < min_doc_count:
        raise ValueError(
            "max_df corresponds to < documents than min_df")

    X, self.stop_words_ = self._limit_features(
        X, vocabulary, max_doc_count, min_doc_count, max_features)

    self.vocabulary_ = vocabulary
    return X
|
||
|
|
||
|
def transform(self, raw_documents):
    """Turn raw documents into a document-term matrix.

    Token counts are extracted using either the vocabulary learned by fit
    or the one handed to the constructor.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    X : sparse matrix, [n_samples, n_features]
        Document-term matrix.

    Raises
    ------
    ValueError
        If ``raw_documents`` is a single string rather than an iterable of
        documents.
    """
    if isinstance(raw_documents, six.string_types):
        raise ValueError(
            "Iterable over raw text documents expected, "
            "string object received.")

    # A vocabulary passed to the constructor is only materialised as
    # ``vocabulary_`` once it has been validated.
    if not hasattr(self, 'vocabulary_'):
        self._validate_vocabulary()

    self._check_vocabulary()

    # Same matrix-building strategy as fit_transform, but the vocabulary
    # is frozen; the returned vocabulary is not needed.
    counts = self._count_vocab(raw_documents, fixed_vocab=True)[1]
    if self.binary:
        counts.data.fill(1)
    return counts
|
||
|
|
||
|
def inverse_transform(self, X):
    """Return terms per document with nonzero entries in X.

    Parameters
    ----------
    X : {array, sparse matrix}, shape = [n_samples, n_features]
        Dense input with fewer than two dimensions is promoted to a single
        row.

    Returns
    -------
    X_inv : list of arrays, len = n_samples
        List of 1-d arrays of terms, one per row of ``X``, ordered by
        feature index.

    Raises
    ------
    ValueError
        If dense input has more than two dimensions.
    """
    self._check_vocabulary()

    # Order the terms by feature index so that column j of X corresponds
    # to inverse_vocabulary[j].
    terms = np.array(list(self.vocabulary_.keys()))
    indices = np.array(list(self.vocabulary_.values()))
    inverse_vocabulary = terms[np.argsort(indices)]

    if sp.issparse(X):
        # We need CSR format for fast row manipulations.
        X = X.tocsr()
        return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel()
                for i in range(X.shape[0])]

    # Dense input: work on a plain 2-d ndarray instead of going through
    # np.matrix, which NumPy discourages and which emits a
    # PendingDeprecationWarning on construction.
    X = np.atleast_2d(np.asarray(X))
    if X.ndim != 2:
        raise ValueError("matrix must be 2-dimensional")
    return [inverse_vocabulary[np.flatnonzero(row)].ravel() for row in X]
|
||
|
|
||
|
def get_feature_names(self):
    """List the feature names, positioned by their integer feature index.

    Returns
    -------
    feature_names : list
        The keys of ``vocabulary_`` sorted by their mapped index.
    """
    self._check_vocabulary()

    # (term, index) pairs sorted on the index component.
    pairs_by_index = sorted(six.iteritems(self.vocabulary_),
                            key=itemgetter(1))
    return [term for term, _ in pairs_by_index]
|
||
|
|
||
|
|
||
|
def _make_int_array():
|
||
|
"""Construct an array.array of a type suitable for scipy.sparse indices."""
|
||
|
return array.array(str("i"))
|
||
|
|
||
|
|
||
|
class TfidfTransformer(BaseEstimator, TransformerMixin):
    """Turn a count matrix into a normalized tf or tf-idf representation

    "Tf" stands for term-frequency and "tf-idf" for term-frequency times
    inverse document-frequency, a term weighting scheme that is common in
    information retrieval and has also proved useful for document
    classification.

    Weighting by tf-idf rather than by the raw number of occurrences of a
    token in a document scales down the impact of tokens that occur very
    frequently in a corpus, since those are empirically less informative
    than features found in only a small fraction of the training corpus.

    The tf-idf of a term t is computed as
    tf-idf(d, t) = tf(t) * idf(d, t), with the idf given by
    idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``),
    where n is the total number of documents and df(d, t), the document
    frequency, is the number of documents d that contain term t. Adding "1"
    to the idf means that terms with zero idf, i.e. terms that occur in
    every document of a training set, are not entirely ignored.
    (Note that this idf formula differs from the standard textbook notation,
    which defines the idf as
    idf(d, t) = log [ n / (df(d, t) + 1) ]).

    With ``smooth_idf=True`` (the default), the constant "1" is added to
    both the numerator and the denominator of the idf, as if one extra
    document containing every term in the collection exactly once had been
    seen, which prevents zero divisions:
    idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1.

    The formulas used for tf and idf also depend on parameter settings that
    map onto the SMART notation used in IR as follows:

    Tf is "n" (natural) by default, "l" (logarithmic) when
    ``sublinear_tf=True``.
    Idf is "t" when use_idf is given, "n" (none) otherwise.
    Normalization is "c" (cosine) when ``norm='l2'``, "n" (none)
    when ``norm=None``.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    References
    ----------

    .. [Yates2011] `R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
                   Information Retrieval. Addison Wesley, pp. 68-74.`

    .. [MRS2008] `C.D. Manning, P. Raghavan and H. Schütze  (2008).
                   Introduction to Information Retrieval. Cambridge University
                   Press, pp. 118-120.`
    """

    def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y=None):
        """Compute the idf vector (global term weights) from a count matrix

        Nothing is learned when ``use_idf`` is false.

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts

        y : ignored
            Accepted but not used by this method.

        Returns
        -------
        self
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        if not self.use_idf:
            return self

        n_samples, n_features = X.shape
        doc_freq = _document_frequency(X)

        # Optional smoothing: behave as if one more document containing
        # every term had been seen.
        smoothing = int(self.smooth_idf)
        doc_freq += smoothing
        n_samples += smoothing

        # Using log + 1 rather than plain log keeps terms whose idf would
        # be zero from being suppressed entirely.
        idf = np.log(float(n_samples) / doc_freq) + 1.0

        # Stored as a diagonal matrix so that transform can apply the
        # weights with one sparse product.
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')
        return self

    def transform(self, X, copy=True):
        """Convert a count matrix to its tf or tf-idf representation

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts

        copy : boolean, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]

        Raises
        ------
        ValueError
            If ``use_idf`` is set and the number of columns of ``X`` does
            not match the number of features seen during fit.
        """
        is_float = hasattr(X, 'dtype') and np.issubdtype(X.dtype,
                                                          np.floating)
        if is_float:
            # Input already belongs to the float family: keep its dtype.
            X = sp.csr_matrix(X, copy=copy)
        else:
            # Counts or binary occurrences are promoted to floats.
            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

        n_features = X.shape[1]

        if self.sublinear_tf:
            # In-place 1 + log(tf) over the stored entries only.
            np.log(X.data, X.data)
            X.data += 1

        if self.use_idf:
            check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')

            expected_n_features = self._idf_diag.shape[0]
            if n_features != expected_n_features:
                raise ValueError("Input has n_features=%d while the model"
                                 " has been trained with n_features=%d" % (
                                     n_features, expected_n_features))
            # An in-place ``*=`` doesn't work here, so rebind the product.
            X = X * self._idf_diag

        if self.norm:
            X = normalize(X, norm=self.norm, copy=False)

        return X

    @property
    def idf_(self):
        # When _idf_diag has not been set this raises AttributeError, so
        # hasattr(self, "idf_") evaluates to False.
        return np.ravel(self._idf_diag.sum(axis=0))
|
||
|
|
||
|
|
||
|
class TfidfVectorizer(CountVectorizer):
    """Turn a collection of raw documents into a matrix of TF-IDF features.

    Equivalent to CountVectorizer followed by TfidfTransformer.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of strings or
        bytes items that are analyzed directly.

    encoding : string, 'utf-8' by default.
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode', None}
        Remove accents during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) does nothing.

    analyzer : string, {'word', 'char'} or callable
        Whether the feature should be made of word or character n-grams.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

    preprocessor : callable or None (default)
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.

    tokenizer : callable or None (default)
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used.

    stop_words : string {'english'}, list, or None (default)
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.

    lowercase : boolean, default True
        Convert all characters to lowercase before tokenizing.

    token_pattern : string
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int or None, default=None
        If not None, build a vocabulary that only considers the top
        max_features ordered by term frequency across the corpus.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, optional
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : boolean, default=False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set idf and normalization to False to get 0/1 outputs.)

    dtype : type, optional
        Type of the matrix returned by fit_transform() or transform().

    norm : 'l1', 'l2' or None, optional
        Norm used to normalize term vectors. None for no normalization.

    use_idf : boolean, default=True
        Enable inverse-document-frequency reweighting.

    smooth_idf : boolean, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : boolean, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    idf_ : array, shape = [n_features]
        The learned idf vector (global term weights), read from the
        underlying TfidfTransformer. Accessing it before idf weights have
        been fitted raises AttributeError.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See also
    --------
    CountVectorizer
        Tokenize the documents and count the occurrences of token and return
        them as a sparse matrix

    TfidfTransformer
        Apply Term Frequency Inverse Document Frequency normalization to a
        sparse matrix of occurrence counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.
    """

    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):

        # Counting-related options are handled by the parent class.
        super(TfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        # Weighting-related options live on an internal TfidfTransformer.
        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    # The TF-IDF parameters are exposed as properties that read from and
    # write to the underlying transformer instance, for easy grid search
    # and repr.

    @property
    def norm(self):
        return self._tfidf.norm

    @norm.setter
    def norm(self, value):
        self._tfidf.norm = value

    @property
    def use_idf(self):
        return self._tfidf.use_idf

    @use_idf.setter
    def use_idf(self, value):
        self._tfidf.use_idf = value

    @property
    def smooth_idf(self):
        return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
        self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
        return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
        self._tfidf.sublinear_tf = value

    @property
    def idf_(self):
        return self._tfidf.idf_

    def fit(self, raw_documents, y=None):
        """Fit the vocabulary and the idf weights on the training set.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        y : ignored
            Accepted but not used by this method.

        Returns
        -------
        self : TfidfVectorizer
        """
        counts = super(TfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(counts)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Fit vocabulary and idf, then return the term-document matrix.

        Gives the same result as fit followed by transform, but the
        documents are only counted once.

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        y : ignored
            Accepted but not used by this method.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        counts = super(TfidfVectorizer, self).fit_transform(raw_documents)
        self._tfidf.fit(counts)
        # The count matrix was built just above from raw_documents, so the
        # weighting step is allowed to work on it without copying.
        return self._tfidf.transform(counts, copy=False)

    def transform(self, raw_documents, copy=True):
        """Turn documents into a tf-idf-weighted document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects

        copy : boolean, default True
            Accepted but not consulted by this method: the count matrix
            built here is always passed to the tf-idf step with
            ``copy=False``.

        Returns
        -------
        X : sparse matrix, [n_samples, n_features]
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')

        counts = super(TfidfVectorizer, self).transform(raw_documents)
        return self._tfidf.transform(counts, copy=False)
|