laywerrobot/lib/python3.6/site-packages/gensim/test/test_tfidfmodel.py
2020-08-27 21:55:39 +02:00

340 lines
14 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated tests for checking transformation algorithms (the models package).
"""
import logging
import unittest
import numpy as np
from gensim.corpora.mmcorpus import MmCorpus
from gensim.models import tfidfmodel
from gensim.test.utils import datapath, get_tmpfile, common_dictionary, common_corpus
from gensim.corpora import Dictionary
texts = [
['complier', 'system', 'computer'],
['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
['graph', 'flow', 'network', 'graph'],
['loading', 'computer', 'system'],
['user', 'server', 'system'],
['tree', 'hamiltonian'],
['graph', 'trees'],
['computer', 'kernel', 'malfunction', 'computer'],
['server', 'system', 'computer'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
class TestTfidfModel(unittest.TestCase):
def setUp(self):
self.corpus = MmCorpus(datapath('testcorpus.mm'))
def test_transform(self):
# create the transformation model
model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
# transform one document
doc = list(self.corpus)[0]
transformed = model[doc]
expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
self.assertTrue(np.allclose(transformed, expected))
def test_init(self):
# create the transformation model by analyzing a corpus
# uses the global `corpus`!
model1 = tfidfmodel.TfidfModel(common_corpus)
dfs = common_dictionary.dfs
# make sure the dfs<->idfs transformation works
self.assertEqual(model1.dfs, dfs)
self.assertEqual(model1.idfs, tfidfmodel.precompute_idfs(model1.wglobal, dfs, len(common_corpus)))
# create the transformation model by directly supplying a term->docfreq
# mapping from the global var `dictionary`.
model2 = tfidfmodel.TfidfModel(dictionary=common_dictionary)
self.assertEqual(model1.idfs, model2.idfs)
def test_persistence(self):
# Test persistence without using `smartirs`
fname = get_tmpfile('gensim_models.tst')
model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector
# Test persistence with using `smartirs`
fname = get_tmpfile('gensim_models_smartirs.tst')
model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector
# Test persistence between Gensim v3.2.0 and current model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector
# Test persistence with using pivoted normalization
fname = get_tmpfile('gensim_models_smartirs.tst')
model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
# Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
def test_persistence_compressed(self):
# Test persistence without using `smartirs`
fname = get_tmpfile('gensim_models.tst.gz')
model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector
# Test persistence with using `smartirs`
fname = get_tmpfile('gensim_models_smartirs.tst.gz')
model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector
# Test persistence between Gensim v3.2.0 and current compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector
# Test persistence with using pivoted normalization
fname = get_tmpfile('gensim_models_smartirs.tst.gz')
model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model.save(fname)
model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
self.assertTrue(model.idfs == model2.idfs)
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
# Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
def test_consistency(self):
docs = [corpus[1], corpus[2]]
# Test if `ntc` yields the default docs.
model = tfidfmodel.TfidfModel(corpus, smartirs='ntc')
transformed_docs = [model[docs[0]], model[docs[1]]]
model = tfidfmodel.TfidfModel(corpus)
expected_docs = [model[docs[0]], model[docs[1]]]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# Testing all the variations of `wlocal`
# nnn
model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = docs[:]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# lnn
model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 2.0), (9, 1.0), (10, 1.0)]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# ann
model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 1.0), (9, 0.75), (10, 0.75)]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# bnn
model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
[(5, 1), (9, 1), (10, 1)]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# Lnn
model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0),
(7, 1.0), (8, 1.0)
],
[
(5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526)
]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# Testing all the variations of `glocal`
# ntn
model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156), (6, 3.169925001442312),
(7, 3.169925001442312), (8, 2.169925001442312)
],
[
(5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312)
]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# npn
model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0),
(7, 3.0), (8, 1.8073549220576042)
],
[
(5, 2.0), (9, 3.0), (10, 3.0)
]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
# Testing all the variations of `normalize`
# nnc
model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905), (6, 0.4082482905),
(7, 0.4082482905), (8, 0.4082482905)
],
[
(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)
]
]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
transformed_docs = [model[docs[0]], model[docs[1]]]
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
expected_docs = [model[docs[0]], model[docs[1]]]
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
def test_pivoted_normalization(self):
docs = [corpus[1], corpus[2]]
# Test if slope=1 yields the default docs for pivoted normalization.
model = tfidfmodel.TfidfModel(self.corpus)
transformed_docs = [model[docs[0]], model[docs[1]]]
model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
expected_docs = [model[docs[0]], model[docs[1]]]
self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0])))
self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1])))
# Test if pivoted model is consistent
model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=0.5)
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(8, 0.8884910505493495), (7, 0.648974041227711), (6, 0.8884910505493495),
(5, 0.648974041227711), (4, 0.8884910505493495), (3, 0.8884910505493495)
],
[
(10, 0.8164965809277263), (9, 0.8164965809277263), (5, 1.6329931618554525)
]
]
self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0])))
self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1])))
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()