from __future__ import unicode_literals
import warnings

from sklearn.feature_extraction.text import strip_tags
from sklearn.feature_extraction.text import strip_accents_unicode
from sklearn.feature_extraction.text import strip_accents_ascii

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from sklearn.base import clone

import numpy as np
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
from numpy.testing import assert_raises
from sklearn.utils.testing import (assert_equal, assert_false, assert_true,
                                   assert_not_equal, assert_almost_equal,
                                   assert_in, assert_less, assert_greater,
                                   assert_warns_message, assert_raise_message,
                                   clean_warning_registry, ignore_warnings,
                                   SkipTest)

from collections import defaultdict
from sklearn.utils.fixes import _Mapping as Mapping
from functools import partial
import pickle
from io import StringIO


JUNK_FOOD_DOCS = (
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
)

NOTJUNK_FOOD_DOCS = (
    "the salad celeri copyright",
    "the salad salad sparkling water copyright",
    "the the celeri celeri copyright",
    "the tomato tomato salad water",
    "the tomato salad water copyright",
)

ALL_FOOD_DOCS = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS


def uppercase(s):
    return strip_accents_unicode(s).upper()


def strip_eacute(s):
    return s.replace('\xe9', 'e')


def split_tokenize(s):
    return s.split()


def lazy_analyze(s):
    return ['the_ultimate_feature']


def test_strip_accents():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_unicode(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_unicode(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = '\u0627'  # simple alef
    assert_equal(strip_accents_unicode(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_unicode(a), expected)


def test_to_ascii():
    # check some classical latin accentuated symbols
    a = '\xe0\xe1\xe2\xe3\xe4\xe5\xe7\xe8\xe9\xea\xeb'
    expected = 'aaaaaaceeee'
    assert_equal(strip_accents_ascii(a), expected)

    a = '\xec\xed\xee\xef\xf1\xf2\xf3\xf4\xf5\xf6\xf9\xfa\xfb\xfc\xfd'
    expected = 'iiiinooooouuuuy'
    assert_equal(strip_accents_ascii(a), expected)

    # check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
    assert_equal(strip_accents_ascii(a), expected)

    # mix letters accentuated and not
    a = "this is \xe0 test"
    expected = 'this is a test'
    assert_equal(strip_accents_ascii(a), expected)


def test_word_analyzer_unigrams():
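    # Exercise the word analyzer with ASCII accent stripping, file input,
    # a custom preprocessor and a custom tokenizer.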
    for Vectorizer in (CountVectorizer, HashingVectorizer):
        wa = Vectorizer(strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
                    'etait', 'pas', 'tres', 'bon']
        assert_equal(wa(text), expected)

        text = "This is a test, really.\n\n I met Harry yesterday."
        expected = ['this', 'is', 'test', 'really', 'met', 'harry',
                    'yesterday']
        assert_equal(wa(text), expected)

        wa = Vectorizer(input='file').build_analyzer()
        text = StringIO("This is a test with a file-like object!")
        expected = ['this', 'is', 'test', 'with', 'file', 'like',
                    'object']
        assert_equal(wa(text), expected)

        # with custom preprocessor
        wa = Vectorizer(preprocessor=uppercase).build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                " c'\xe9tait pas tr\xeas bon.")
        expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
                    'ETAIT', 'PAS', 'TRES', 'BON']
        assert_equal(wa(text), expected)

        # with custom tokenizer
        wa = Vectorizer(tokenizer=split_tokenize,
                        strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
                    "c'etait", 'pas', 'tres', 'bon.']
        assert_equal(wa(text), expected)


def test_word_analyzer_unigrams_and_bigrams():
    wa = CountVectorizer(analyzer="word", strip_accents='unicode',
                         ngram_range=(1, 2)).build_analyzer()

    text = "J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon."
    expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
                'etait', 'pas', 'tres', 'bon', 'ai mange', 'mange du',
                'du kangourou', 'kangourou ce', 'ce midi', 'midi etait',
                'etait pas', 'pas tres', 'tres bon']
    assert_equal(wa(text), expected)


def test_unicode_decode_error():
    # decode_error defaults to 'strict', so this should fail
    # First, encode (as bytes) a unicode string.
    text = "J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon."
    text_bytes = text.encode('utf-8')

    # Then let the Analyzer try to decode it as ascii. It should fail,
    # because we have given it an incorrect encoding.
    wa = CountVectorizer(ngram_range=(1, 2), encoding='ascii').build_analyzer()
    assert_raises(UnicodeDecodeError, wa, text_bytes)

    ca = CountVectorizer(analyzer='char', ngram_range=(3, 6),
                         encoding='ascii').build_analyzer()
    assert_raises(UnicodeDecodeError, ca, text_bytes)


def test_char_ngram_analyzer():
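    # Character n-grams from the 'char' analyzer may span whitespace and
    # punctuation.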
    cnga = CountVectorizer(analyzer='char', strip_accents='unicode',
                           ngram_range=(3, 6)).build_analyzer()

    text = "J'ai mang\xe9 du kangourou ce midi, c'\xe9tait pas tr\xeas bon"
    expected = ["j'a", "'ai", 'ai ', 'i m', ' ma']
    assert_equal(cnga(text)[:5], expected)
    expected = ['s tres', ' tres ', 'tres b', 'res bo', 'es bon']
    assert_equal(cnga(text)[-5:], expected)

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ['thi', 'his', 'is ', 's i', ' is']
    assert_equal(cnga(text)[:5], expected)

    expected = [' yeste', 'yester', 'esterd', 'sterda', 'terday']
    assert_equal(cnga(text)[-5:], expected)

    cnga = CountVectorizer(input='file', analyzer='char',
                           ngram_range=(3, 6)).build_analyzer()
    text = StringIO("This is a test with a file-like object!")
    expected = ['thi', 'his', 'is ', 's i', ' is']
    assert_equal(cnga(text)[:5], expected)


def test_char_wb_ngram_analyzer():
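    # The 'char_wb' analyzer pads each word with spaces and only produces
    # n-grams from within word boundaries.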
    cnga = CountVectorizer(analyzer='char_wb', strip_accents='unicode',
                           ngram_range=(3, 6)).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = [' th', 'thi', 'his', 'is ', ' thi']
    assert_equal(cnga(text)[:5], expected)

    expected = ['yester', 'esterd', 'sterda', 'terday', 'erday ']
    assert_equal(cnga(text)[-5:], expected)

    cnga = CountVectorizer(input='file', analyzer='char_wb',
                           ngram_range=(3, 6)).build_analyzer()
    text = StringIO("A test with a file-like object!")
    expected = [' a ', ' te', 'tes', 'est', 'st ', ' tes']
    assert_equal(cnga(text)[:6], expected)


def test_word_ngram_analyzer():
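    # Word n-grams of size 3 to 6; single-character tokens such as 'a' and
    # 'I' are dropped by the default token pattern.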
    cnga = CountVectorizer(analyzer='word', strip_accents='unicode',
                           ngram_range=(3, 6)).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ['this is test', 'is test really', 'test really met']
    assert_equal(cnga(text)[:3], expected)

    expected = ['test really met harry yesterday',
                'this is test really met harry',
                'is test really met harry yesterday']
    assert_equal(cnga(text)[-3:], expected)

    cnga_file = CountVectorizer(input='file', analyzer='word',
                                ngram_range=(3, 6)).build_analyzer()
    file = StringIO(text)
    assert_equal(cnga_file(file), cnga(text))


def test_countvectorizer_custom_vocabulary():
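    # A user-provided vocabulary (mapping or iterable) should be used as-is
    # instead of being learned from the corpus.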
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert_equal(vect.vocabulary_, vocab)
        else:
            assert_equal(set(vect.vocabulary_), terms)
        X = vect.transform(JUNK_FOOD_DOCS)
        assert_equal(X.shape[1], len(terms))


def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary_),
                 set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like))


def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("vocabulary contains repeated indices", str(e).lower())


def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("doesn't contain index", str(e).lower())


def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words='english')
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)
    cv.set_params(stop_words='_bad_str_stop_')
    assert_raises(ValueError, cv.get_stop_words)
    cv.set_params(stop_words='_bad_unicode_stop_')
    assert_raises(ValueError, cv.get_stop_words)
    stoplist = ['some', 'other', 'words']
    cv.set_params(stop_words=stoplist)
    assert_equal(cv.get_stop_words(), set(stoplist))


def test_countvectorizer_empty_vocabulary():
    try:
        vect = CountVectorizer(vocabulary=[])
        vect.fit(["foo"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e).lower())

    try:
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e).lower())


def test_fit_countvectorizer_twice():
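    # Refitting on a different corpus should rebuild the vocabulary from
    # scratch, so the number of features changes.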
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert_not_equal(X1.shape[1], X2.shape[1])


def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())


def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features
    # with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    clean_warning_registry()
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    in_warning_message = 'divide by zero'
    tfidf = assert_warns_message(RuntimeWarning, in_warning_message,
                                 tr.fit_transform, X).toarray()
    if not numpy_provides_div0_warning:
        raise SkipTest("Numpy does not provide div 0 warnings.")


def test_sublinear_tf():
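    # With sublinear_tf the raw term frequency tf is replaced by 1 + log(tf),
    # so the transformed values grow slower than the counts.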
    X = [[1], [2], [3]]
    tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None)
    tfidf = tr.fit_transform(X).toarray()
    assert_equal(tfidf[0], 1)
    assert_greater(tfidf[1], tfidf[0])
    assert_greater(tfidf[2], tfidf[1])
    assert_less(tfidf[1], 2)
    assert_less(tfidf[2], 3)


def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_false(hasattr(t2, "idf_"))

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_false(tv.fixed_vocabulary_)
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)


def test_tfidf_vectorizer_setters():
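    # Parameters set on the vectorizer should propagate to the wrapped
    # TfidfTransformer stored in the private _tfidf attribute.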
    tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False,
                         sublinear_tf=False)
    tv.norm = 'l1'
    assert_equal(tv._tfidf.norm, 'l1')
    tv.use_idf = True
    assert_true(tv._tfidf.use_idf)
    tv.smooth_idf = True
    assert_true(tv._tfidf.smooth_idf)
    tv.sublinear_tf = True
    assert_true(tv._tfidf.sublinear_tf)


@ignore_warnings(category=DeprecationWarning)
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert_true(np.min(X.data) > -1)
    assert_true(np.min(X.data) < 0)
    assert_true(np.max(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert_true(ngrams_nnz > token_nnz)
    assert_true(ngrams_nnz < 2 * token_nnz)

    # makes the feature values bounded
    assert_true(np.min(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)


def test_feature_names():
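    # get_feature_names should return the terms ordered by their column
    # indices in the fitted vocabulary.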
    cv = CountVectorizer(max_df=0.5)

    # test for ValueError on unfitted/empty vocabulary
    assert_raises(ValueError, cv.get_feature_names)

    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert_equal(len(cv.vocabulary_), n_features)

    feature_names = cv.get_feature_names()
    assert_equal(len(feature_names), n_features)
    assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza',
                        'salad', 'sparkling', 'tomato', 'water'],
                       feature_names)

    for idx, name in enumerate(feature_names):
        assert_equal(idx, cv.vocabulary_.get(name))


def test_vectorizer_max_features():
    vec_factories = (
        CountVectorizer,
        TfidfVectorizer,
    )

    expected_vocabulary = set(['burger', 'beer', 'salad', 'pizza'])
    expected_stop_words = set([u'celeri', u'tomato', u'copyright', u'coke',
                               u'sparkling', u'water', u'the'])

    for vec_factory in vec_factories:
        # test bounded number of extracted features
        vectorizer = vec_factory(max_df=0.6, max_features=4)
        vectorizer.fit(ALL_FOOD_DOCS)
        assert_equal(set(vectorizer.vocabulary_), expected_vocabulary)
        assert_equal(vectorizer.stop_words_, expected_stop_words)


def test_count_vectorizer_max_features():
    # Regression test: max_features didn't work correctly in 0.14.

    cv_1 = CountVectorizer(max_features=1)
    cv_3 = CountVectorizer(max_features=3)
    cv_None = CountVectorizer(max_features=None)

    counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)

    features_1 = cv_1.get_feature_names()
    features_3 = cv_3.get_feature_names()
    features_None = cv_None.get_feature_names()

    # The most common feature is "the", with frequency 7.
    assert_equal(7, counts_1.max())
    assert_equal(7, counts_3.max())
    assert_equal(7, counts_None.max())

    # The most common feature should be the same
    assert_equal("the", features_1[np.argmax(counts_1)])
    assert_equal("the", features_3[np.argmax(counts_3)])
    assert_equal("the", features_None[np.argmax(counts_None)])


def test_vectorizer_max_df():
    test_data = ['abc', 'dea', 'eat']
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    vect.fit(test_data)
    assert_true('a' in vect.vocabulary_.keys())
    assert_equal(len(vect.vocabulary_.keys()), 6)
    assert_equal(len(vect.stop_words_), 0)

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert_true('a' not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
    assert_true('a' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)

    vect.max_df = 1
    vect.fit(test_data)
    assert_true('a' not in vect.vocabulary_.keys())  # {ae} ignored
    assert_equal(len(vect.vocabulary_.keys()), 4)  # {bcdt} remain
    assert_true('a' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 2)


def test_vectorizer_min_df():
    test_data = ['abc', 'dea', 'eat']
    vect = CountVectorizer(analyzer='char', min_df=1)
    vect.fit(test_data)
    assert_true('a' in vect.vocabulary_.keys())
    assert_equal(len(vect.vocabulary_.keys()), 6)
    assert_equal(len(vect.stop_words_), 0)

    vect.min_df = 2
    vect.fit(test_data)
    assert_true('c' not in vect.vocabulary_.keys())  # {bcdt} ignored
    assert_equal(len(vect.vocabulary_.keys()), 2)  # {ae} remain
    assert_true('c' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 4)

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    assert_true('c' not in vect.vocabulary_.keys())  # {bcdet} ignored
    assert_equal(len(vect.vocabulary_.keys()), 1)  # {a} remains
    assert_true('c' in vect.stop_words_)
    assert_equal(len(vect.stop_words_), 5)


def test_count_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = CountVectorizer(analyzer='char', max_df=1.0)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal(['a', 'b', 'c', 'd', 'e'], vect.get_feature_names())
    assert_array_equal([[3, 1, 1, 0, 0],
                        [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal([[1, 1, 1, 0, 0],
                        [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer='char', max_df=1.0,
                           binary=True, dtype=np.float32)
    X_sparse = vect.fit_transform(test_data)
    assert_equal(X_sparse.dtype, np.float32)


@ignore_warnings(category=DeprecationWarning)
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', non_negative=True, binary=True,
                             norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)


def test_vectorizer_inverse_transform():
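    # inverse_transform should recover, for each document, the set of terms
    # that the analyzer extracted from it.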
    # raw documents
    data = ALL_FOOD_DOCS
    for vectorizer in (TfidfVectorizer(), CountVectorizer()):
        transformed_data = vectorizer.fit_transform(data)
        inversed_data = vectorizer.inverse_transform(transformed_data)
        analyze = vectorizer.build_analyzer()
        for doc, inversed_terms in zip(data, inversed_data):
            terms = np.sort(np.unique(analyze(doc)))
            inversed_terms = np.sort(np.unique(inversed_terms))
            assert_array_equal(terms, inversed_terms)

        # Test that inverse_transform also works with numpy arrays
        transformed_data = transformed_data.toarray()
        inversed_data2 = vectorizer.inverse_transform(transformed_data)
        for terms, terms2 in zip(inversed_data, inversed_data2):
            assert_array_equal(np.sort(terms), np.sort(terms2))


def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.2, random_state=0)

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'svc__loss': ('hinge', 'squared_hinge')
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset several parameter combinations converge to 100%
    # accuracy models, and the selected best estimator uses the unigram
    # representation
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))


def test_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=.1, random_state=0)

    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'vect__norm': ('l1', 'l2'),
        'svc__loss': ('hinge', 'squared_hinge'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)

    # on this toy dataset several parameter combinations converge to 100%
    # accuracy models; the selected best estimator uses the unigram
    # representation with l2 normalization
    assert_equal(grid_search.best_score_, 1.0)
    best_vectorizer = grid_search.best_estimator_.named_steps['vect']
    assert_equal(best_vectorizer.ngram_range, (1, 1))
    assert_equal(best_vectorizer.norm, 'l2')
    assert_false(best_vectorizer.fixed_vocabulary_)


def test_vectorizer_pipeline_cross_validation():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('svc', LinearSVC())])

    cv_scores = cross_val_score(pipeline, data, target, cv=3)
    assert_array_equal(cv_scores, [1., 1., 1.])


@ignore_warnings(category=DeprecationWarning)
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))


def test_tfidf_vectorizer_with_fixed_vocabulary():
    # non regression smoke test for inheritance issues
    vocabulary = ['pizza', 'celeri']
    vect = TfidfVectorizer(vocabulary=vocabulary)
    X_1 = vect.fit_transform(ALL_FOOD_DOCS)
    X_2 = vect.transform(ALL_FOOD_DOCS)
    assert_array_almost_equal(X_1.toarray(), X_2.toarray())
    assert_true(vect.fixed_vocabulary_)


def test_pickling_vectorizer():
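    # Vectorizers should survive a pickle round-trip with identical
    # parameters and identical transform output.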
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray())


def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_set = set(rng.choice(vocab_words, size=5, replace=False))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())


def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
                            'salad', 'sparkling', 'tomato', 'water'])
    for x in range(0, 100):
        vocab_dict = dict()
        words = rng.choice(vocab_words, size=5, replace=False)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_equal(cv.get_feature_names(), unpickled_cv.get_feature_names())


def test_stop_words_removal():
    # Ensure that deleting the stop_words_ attribute doesn't affect transform

    fitted_vectorizers = (
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
    )

    for vect in fitted_vectorizers:
        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        vect.stop_words_ = None
        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        delattr(vect, 'stop_words_')
        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        assert_array_equal(stop_None_transform, vect_transform)
        assert_array_equal(stop_del_transform, vect_transform)


def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert_equal(type(copy), orig.__class__)
    assert_array_equal(
        copy.fit_transform(X).toarray(),
        orig.fit_transform(X).toarray())


def test_non_unique_vocab():
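    # Duplicate terms in a user-provided vocabulary should raise a ValueError
    # at fit time.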
    vocab = ['a', 'b', 'c', 'a', 'a']
    vect = CountVectorizer(vocabulary=vocab)
    assert_raises(ValueError, vect.fit, [])


def test_hashingvectorizer_nan_in_docs():
    # np.nan can appear when using pandas to load text fields from a csv file
    # with missing values.
    message = "np.nan is an invalid document, expected byte or unicode string."
    exception = ValueError

    def func():
        hv = HashingVectorizer()
        hv.fit_transform(['hello world', np.nan, 'hello hello'])

    assert_raise_message(exception, message, func)


def test_tfidfvectorizer_binary():
    # Non-regression test: TfidfVectorizer used to ignore its "binary" param.
    v = TfidfVectorizer(binary=True, use_idf=False, norm=None)
    assert_true(v.binary)

    X = v.fit_transform(['hello world', 'hello hello']).toarray()
    assert_array_equal(X.ravel(), [1, 1, 1, 0])
    X2 = v.transform(['hello world', 'hello hello']).toarray()
    assert_array_equal(X2.ravel(), [1, 1, 1, 0])


def test_tfidfvectorizer_export_idf():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)


def test_vectorizer_vocab_clone():
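    # Cloning a vectorizer built with a fixed vocabulary should preserve that
    # vocabulary in the clone.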
    vect_vocab = TfidfVectorizer(vocabulary=["the"])
    vect_vocab_clone = clone(vect_vocab)
    vect_vocab.fit(ALL_FOOD_DOCS)
    vect_vocab_clone.fit(ALL_FOOD_DOCS)
    assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_)


def test_vectorizer_string_object_as_input():
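    # Passing a single string instead of an iterable of documents should
    # raise a helpful ValueError.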
    message = ("Iterable over raw text documents expected, "
               "string object received.")
    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
        assert_raise_message(
            ValueError, message, vec.fit_transform, "hello world!")
        assert_raise_message(
            ValueError, message, vec.fit, "hello world!")
        assert_raise_message(
            ValueError, message, vec.transform, "hello world!")