#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2016 Radim Rehurek # Copyright (C) 2016 Olavur Mortensen # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ Automated tests for the author-topic model (AuthorTopicModel class). These tests are based on the unit tests of LDA; the classes are quite similar, and the tests needed are thus quite similar. """ import logging import unittest import numbers from os import remove import six import numpy as np from gensim.corpora import mmcorpus, Dictionary from gensim.models import atmodel from gensim import matutils from gensim.test import basetmtests from gensim.test.utils import (datapath, get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus) from gensim.matutils import jensen_shannon # TODO: # Test that computing the bound on new unseen documents works as expected (this is somewhat different # in the author-topic model than in LDA). # Perhaps test that the bound increases, in general (i.e. in several of the tests below where it makes # sense. This is not tested in LDA either. Tests can also be made to check that automatic prior learning # increases the bound. # Test that models are compatiple across versions, as done in LdaModel. # Assign some authors randomly to the documents above. author2doc = { 'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7] } doc2author = { 0: ['john', 'jack'], 1: ['john', 'jill'], 2: ['john', 'jane', 'jack'], 3: ['john', 'jane', 'jill'], 4: ['john', 'jane', 'jack'], 5: ['john', 'jane', 'jill'], 6: ['john', 'jane', 'jack'], 7: ['jane', 'jill'], 8: ['jane', 'jack'] } # More data with new and old authors (to test update method). # Although the text is just a subset of the previous, the model # just sees it as completely new data. 
texts_new = common_texts[0:3]
author2doc_new = {'jill': [0], 'bob': [0, 1], 'sally': [1, 2]}
# NOTE(review): the bag-of-words ids below come from dictionary_new, whereas the models in the
# tests are built with id2word=dictionary -- confirm that the differing id spaces are intended.
dictionary_new = Dictionary(texts_new)
corpus_new = [dictionary_new.doc2bow(text) for text in texts_new]


class TestAuthorTopicModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
    """Tests for atmodel.AuthorTopicModel: training, updating, priors, querying and persistence."""

    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.class_ = atmodel.AuthorTopicModel
        self.model = self.class_(corpus, id2word=dictionary, author2doc=author2doc, num_topics=2, passes=100)

    def _dense_author_topics(self, model, author):
        # Return the topic distribution of `author` as a dense vector of length model.num_topics,
        # which is easier to compare than the sparse (topic_id, probability) list.
        return matutils.sparse2full(model.get_author_topics(author), model.num_topics)

    def _remove_serialized_corpus(self, path):
        # Delete the MmCorpus file that a serialized model wrote to `path`.
        try:
            remove(path)
        except OSError:
            # The file may be absent if the model failed before writing it; a missing file
            # must not mask the original test failure.
            pass

    def testTransform(self):
        passed = False
        # sometimes, training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        # NOTE(review): random_state is fixed, so every retry presumably starts from the same
        # initialization -- confirm whether the retries still serve a purpose.
        for i in range(25):  # restart at most 25 times
            # create the transformation model
            model = self.class_(id2word=dictionary, num_topics=2, passes=100, random_state=0)
            model.update(corpus, author2doc)

            # NOTE: this test may easily fail if the author-topic model is altered in any way. The model's
            # output is sensitive to a lot of things, like the scheduling of the updates, or like the
            # author2id (because the random initialization changes when author2id changes). If it does
            # fail, simply be aware of whether we broke something, or if it just naturally changed the
            # output of the model slightly.
            vec = self._dense_author_topics(model, 'jill')  # dense vector, for easier equality tests
            expected = [0.91, 0.08]
            # must contain the same values, up to re-ordering
            passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)
            if passed:
                break
            logging.warning(
                "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
                i, sorted(vec), sorted(expected)
            )
        self.assertTrue(passed)

    def testBasic(self):
        # Check that training the model produces a positive topic vector for some author
        # Otherwise, many of the other tests are invalid.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        jill_topics = self._dense_author_topics(model, 'jill')
        self.assertTrue(all(jill_topics > 0))

    def testAuthor2docMissing(self):
        # Check that the results are the same if author2doc is constructed automatically from doc2author.
        model = self.class_(
            corpus, author2doc=author2doc, doc2author=doc2author,
            id2word=dictionary, num_topics=2, random_state=0
        )
        model2 = self.class_(
            corpus, doc2author=doc2author, id2word=dictionary,
            num_topics=2, random_state=0
        )

        # Compare Jill's topics in both models.
        jill_topics = self._dense_author_topics(model, 'jill')
        jill_topics2 = self._dense_author_topics(model2, 'jill')
        self.assertTrue(np.allclose(jill_topics, jill_topics2))

    def testDoc2authorMissing(self):
        # Check that the results are the same if doc2author is constructed automatically from author2doc.
        model = self.class_(
            corpus, author2doc=author2doc, doc2author=doc2author,
            id2word=dictionary, num_topics=2, random_state=0
        )
        model2 = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary,
            num_topics=2, random_state=0
        )

        # Compare Jill's topics in both models.
        jill_topics = self._dense_author_topics(model, 'jill')
        jill_topics2 = self._dense_author_topics(model2, 'jill')
        self.assertTrue(np.allclose(jill_topics, jill_topics2))

    def testUpdate(self):
        # Check that calling update after the model already has been trained works.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        jill_topics = self._dense_author_topics(model, 'jill')

        model.update()
        jill_topics2 = self._dense_author_topics(model, 'jill')

        # Did we learn something?
        self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

    def testUpdateNewDataOldAuthor(self):
        # Check that calling update with new documents and/or authors after the model already has
        # been trained works.
        # Test an author that already existed in the old dataset.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        jill_topics = self._dense_author_topics(model, 'jill')

        model.update(corpus_new, author2doc_new)
        jill_topics2 = self._dense_author_topics(model, 'jill')

        # Did we learn more about Jill?
        self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

    def testUpdateNewDataNewAuthor(self):
        # Check that calling update with new documents and/or authors after the model already has
        # been trained works.
        # Test a new author, that didn't exist in the old dataset.
        model = self.class_(corpus, author2doc=author2doc, id2word=dictionary, num_topics=2)

        model.update(corpus_new, author2doc_new)

        # Did we learn something about Sally?
        sally_topics = self._dense_author_topics(model, 'sally')
        self.assertTrue(all(sally_topics > 0))

    def testSerialized(self):
        # Test the model using serialized corpora. Basic tests, plus test of update functionality.
        serialization_path = datapath('testcorpus_serialization.mm')
        try:
            model = self.class_(
                self.corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
                serialized=True, serialization_path=serialization_path
            )

            jill_topics = self._dense_author_topics(model, 'jill')
            self.assertTrue(all(jill_topics > 0))

            model.update()
            jill_topics2 = self._dense_author_topics(model, 'jill')

            # Did we learn more about Jill?
            self.assertFalse(all(np.equal(jill_topics, jill_topics2)))

            model.update(corpus_new, author2doc_new)

            # Did we learn something about Sally?
            sally_topics = self._dense_author_topics(model, 'sally')
            self.assertTrue(all(sally_topics > 0))
        finally:
            # Delete the MmCorpus used for serialization inside the author-topic model, even when an
            # assertion above fails, so that a leftover file cannot affect later tests or runs.
            self._remove_serialized_corpus(serialization_path)

    def testTransformSerialized(self):
        # Same as testTransform, using serialized corpora.
        serialization_path = datapath('testcorpus_serialization.mm')
        passed = False
        # sometimes, training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        # NOTE(review): random_state is fixed, so every retry presumably starts from the same
        # initialization -- confirm whether the retries still serve a purpose.
        for i in range(25):  # restart at most 25 times
            try:
                # create the transformation model
                model = self.class_(
                    id2word=dictionary, num_topics=2, passes=100, random_state=0,
                    serialized=True, serialization_path=serialization_path
                )
                model.update(self.corpus, author2doc)

                # NOTE: this test may easily fail if the author-topic model is altered in any way. The
                # model's output is sensitive to a lot of things, like the scheduling of the updates, or
                # like the author2id (because the random initialization changes when author2id changes).
                # If it does fail, simply be aware of whether we broke something, or if it just naturally
                # changed the output of the model slightly.
                vec = self._dense_author_topics(model, 'jill')  # dense vector, for easier equality tests
                expected = [0.91, 0.08]
                # must contain the same values, up to re-ordering
                passed = np.allclose(sorted(vec), sorted(expected), atol=1e-1)
            finally:
                # Delete the MmCorpus used for serialization inside the author-topic model; this has to
                # happen on every attempt because the next attempt reuses the same path.
                self._remove_serialized_corpus(serialization_path)

            if passed:
                break
            logging.warning(
                "Author-topic model failed to converge on attempt %i (got %s, expected %s)",
                i, sorted(vec), sorted(expected)
            )
        self.assertTrue(passed)

    def testAlphaAuto(self):
        model1 = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary,
            alpha='symmetric', passes=10, num_topics=2
        )
        modelauto = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary,
            alpha='auto', passes=10, num_topics=2
        )

        # did we learn something?
        self.assertFalse(all(np.equal(model1.alpha, modelauto.alpha)))

    def testAlpha(self):
        kwargs = dict(
            author2doc=author2doc,
            id2word=dictionary,
            num_topics=2,
            alpha=None
        )
        expected_shape = (2,)

        # should not raise anything
        self.class_(**kwargs)

        kwargs['alpha'] = 'symmetric'
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(all(model.alpha == np.array([0.5, 0.5])))

        kwargs['alpha'] = 'asymmetric'
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(np.allclose(model.alpha, [0.630602, 0.369398]))

        kwargs['alpha'] = 0.3
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(all(model.alpha == np.array([0.3, 0.3])))

        kwargs['alpha'] = 3
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(all(model.alpha == np.array([3, 3])))

        kwargs['alpha'] = [0.3, 0.3]
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(all(model.alpha == np.array([0.3, 0.3])))

        kwargs['alpha'] = np.array([0.3, 0.3])
        model = self.class_(**kwargs)
        self.assertEqual(model.alpha.shape, expected_shape)
        self.assertTrue(all(model.alpha == np.array([0.3, 0.3])))

        # all should raise an exception for being wrong shape
        kwargs['alpha'] = [0.3, 0.3, 0.3]
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['alpha'] = [[0.3], [0.3]]
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['alpha'] = [0.3]
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['alpha'] = "gensim is cool"
        self.assertRaises(ValueError, self.class_, **kwargs)

    def testEtaAuto(self):
        model1 = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary,
            eta='symmetric', passes=10, num_topics=2
        )
        modelauto = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary,
            eta='auto', passes=10, num_topics=2
        )

        # did we learn something?
        self.assertFalse(all(np.equal(model1.eta, modelauto.eta)))

    def testEta(self):
        kwargs = dict(
            author2doc=author2doc,
            id2word=dictionary,
            num_topics=2,
            eta=None
        )
        num_terms = len(dictionary)
        expected_shape = (num_terms,)

        # should not raise anything
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

        kwargs['eta'] = 'symmetric'
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([0.5] * num_terms)))

        kwargs['eta'] = 0.3
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

        kwargs['eta'] = 3
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([3] * num_terms)))

        kwargs['eta'] = [0.3] * num_terms
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

        kwargs['eta'] = np.array([0.3] * num_terms)
        model = self.class_(**kwargs)
        self.assertEqual(model.eta.shape, expected_shape)
        self.assertTrue(all(model.eta == np.array([0.3] * num_terms)))

        # should be ok with num_topics x num_terms
        testeta = np.array([[0.5] * len(dictionary)] * 2)
        kwargs['eta'] = testeta
        self.class_(**kwargs)

        # all should raise an exception for being wrong shape
        kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape)))
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['eta'] = [0.3]
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['eta'] = [0.3] * (num_terms + 1)
        self.assertRaises(AssertionError, self.class_, **kwargs)

        kwargs['eta'] = "gensim is cool"
        self.assertRaises(ValueError, self.class_, **kwargs)

        kwargs['eta'] = "asymmetric"
        self.assertRaises(ValueError, self.class_, **kwargs)

    def testTopTopics(self):
        top_topics = self.model.top_topics(corpus)

        for topic, score in top_topics:
            self.assertIsInstance(topic, list)
            self.assertIsInstance(score, float)

            for v, k in topic:
                self.assertIsInstance(k, six.string_types)
                self.assertIsInstance(v, float)

    def testGetTopicTerms(self):
        topic_terms = self.model.get_topic_terms(1)

        for k, v in topic_terms:
            self.assertIsInstance(k, numbers.Integral)
            self.assertIsInstance(v, float)

    def testGetAuthorTopics(self):
        # random_state takes the seed itself; np.random.seed(0) returns None and would only
        # reseed NumPy's global generator as a side effect.
        model = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            passes=100, random_state=0
        )

        author_topics = [model.get_author_topics(a) for a in model.id2author.values()]

        for topic in author_topics:
            self.assertIsInstance(topic, list)
            for k, v in topic:
                self.assertIsInstance(k, int)
                self.assertIsInstance(v, float)

    def testTermTopics(self):
        # random_state takes the seed itself; np.random.seed(0) returns None and would only
        # reseed NumPy's global generator as a side effect.
        model = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            passes=100, random_state=0
        )

        # check with word_type
        result = model.get_term_topics(2)
        for topic_no, probability in result:
            self.assertIsInstance(topic_no, int)
            self.assertIsInstance(probability, float)

        # if user has entered word instead, check with word
        result = model.get_term_topics(str(model.id2word[2]))
        for topic_no, probability in result:
            self.assertIsInstance(topic_no, int)
            self.assertIsInstance(probability, float)

    def testNewAuthorTopics(self):
        # random_state takes the seed itself; np.random.seed(0) returns None and would only
        # reseed NumPy's global generator as a side effect.
        model = self.class_(
            corpus, author2doc=author2doc, id2word=dictionary, num_topics=2,
            passes=100, random_state=0
        )
        author2doc_newauthor = {}
        author2doc_newauthor["test"] = [0, 1]
        model.update(corpus=corpus[0:2], author2doc=author2doc_newauthor)

        # temp save model state vars before get_new_author_topics is called
        state_gamma_len = len(model.state.gamma)
        author2doc_len = len(model.author2doc)
        author2id_len = len(model.author2id)
        id2author_len = len(model.id2author)
        doc2author_len = len(model.doc2author)

        new_author_topics = model.get_new_author_topics(corpus=corpus[0:2])

        # sanity check
        for k, v in new_author_topics:
            self.assertIsInstance(k, int)
            self.assertIsInstance(v, float)

        # make sure topics are similar enough
        similarity = 1 / (1 + jensen_shannon(model["test"], new_author_topics))
        self.assertGreaterEqual(similarity, 0.9)

        # produce an error to test if rollback occurs
        with self.assertRaises(TypeError):
            model.get_new_author_topics(corpus=corpus[0])

        # assure rollback was successful and the model state is as before
        self.assertEqual(state_gamma_len, len(model.state.gamma))
        self.assertEqual(author2doc_len, len(model.author2doc))
        self.assertEqual(author2id_len, len(model.author2id))
        self.assertEqual(id2author_len, len(model.id2author))
        self.assertEqual(doc2author_len, len(model.doc2author))

    def testPasses(self):
        # long message includes the original error message with a custom one
        self.longMessage = True
        # construct what we expect when passes aren't involved
        test_rhots = []
        model = self.class_(id2word=dictionary, chunksize=1, num_topics=2)

        def final_rhot(model):
            return pow(model.offset + (1 * model.num_updates) / model.chunksize, -model.decay)

        # generate 5 updates to test rhot on
        for _ in range(5):
            model.update(corpus, author2doc)
            test_rhots.append(final_rhot(model))

        for passes in [1, 5, 10, 50, 100]:
            model = self.class_(id2word=dictionary, chunksize=1, num_topics=2, passes=passes)
            self.assertEqual(final_rhot(model), 1.0)
            # make sure the rhot matches the test after each update
            for test_rhot in test_rhots:
                model.update(corpus, author2doc)

                msg = "{}, {}, {}".format(passes, model.num_updates, model.state.numdocs)
                self.assertAlmostEqual(final_rhot(model), test_rhot, msg=msg)

            self.assertEqual(model.state.numdocs, len(corpus) * len(test_rhots))
            self.assertEqual(model.num_updates, len(corpus) * len(test_rhots))

    def testPersistence(self):
        fname = get_tmpfile('gensim_models_atmodel.tst')
        model = self.model
        model.save(fname)
        model2 = self.class_.load(fname)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))
        self.assertTrue(np.allclose(model.state.gamma, model2.state.gamma))

    def testPersistenceIgnore(self):
        fname = get_tmpfile('gensim_models_atmodel_testPersistenceIgnore.tst')
        model = atmodel.AuthorTopicModel(corpus, author2doc=author2doc, num_topics=2)

        # `ignore` is exercised both as a single attribute name and as a list of names.
        model.save(fname, ignore='id2word')
        model2 = atmodel.AuthorTopicModel.load(fname)
        self.assertIsNone(model2.id2word)

        model.save(fname, ignore=['id2word'])
        model2 = atmodel.AuthorTopicModel.load(fname)
        self.assertIsNone(model2.id2word)

    def testPersistenceCompressed(self):
        fname = get_tmpfile('gensim_models_atmodel.tst.gz')
        model = self.model
        model.save(fname)
        model2 = self.class_.load(fname, mmap=None)
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))

        # Compare Jill's topics before and after save/load.
        jill_topics = self._dense_author_topics(model, 'jill')
        jill_topics2 = self._dense_author_topics(model2, 'jill')
        self.assertTrue(np.allclose(jill_topics, jill_topics2))

    def testLargeMmap(self):
        fname = get_tmpfile('gensim_models_atmodel.tst')
        model = self.model

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        model2 = self.class_.load(fname, mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertIsInstance(model2.expElogbeta, np.memmap)
        self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))

        # Compare Jill's topics before and after save/load.
        jill_topics = self._dense_author_topics(model, 'jill')
        jill_topics2 = self._dense_author_topics(model2, 'jill')
        self.assertTrue(np.allclose(jill_topics, jill_topics2))

    def testLargeMmapCompressed(self):
        fname = get_tmpfile('gensim_models_atmodel.tst.gz')
        model = self.model

        # simulate storing large arrays separately
        model.save(fname, sep_limit=0)

        # loading the compressed file with mmap is expected to fail with IOError
        self.assertRaises(IOError, self.class_.load, fname, mmap='r')

    def testDtypeBackwardCompatibility(self):
        atmodel_3_0_1_fname = datapath('atmodel_3_0_1_model')
        expected_topics = [(0, 0.068200842977296727), (1, 0.93179915702270333)]

        # save model to use in test
        # self.model.save(atmodel_3_0_1_fname)

        # load a model saved using a 3.0.1 version of Gensim
        model = self.class_.load(atmodel_3_0_1_fname)

        # and test it on a predefined document
        topics = model['jane']
        self.assertTrue(np.allclose(expected_topics, topics))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()