#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""


import logging
import unittest

import six

import numpy as np

from gensim.utils import to_unicode
from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser
from gensim.models.phrases import pseudocorpus, original_scorer
from gensim.test.utils import common_texts, temporary_file, datapath


class TestUtils(unittest.TestCase):
    """Tests for the ``pseudocorpus`` helper."""

    def test_pseudocorpus_no_common_terms(self):
        """Entries without the delimiter yield nothing; others yield every split."""
        vocab = [
            "prime_minister",
            "gold",
            "chief_technical_officer",
            "effective",
        ]
        result = list(pseudocorpus(vocab, "_"))
        self.assertEqual(
            result,
            [["prime", "minister"],
             ["chief", "technical_officer"],
             ["chief_technical", "officer"]])

    def test_pseudocorpus_with_common_terms(self):
        """Common terms are emitted as separate tokens between the two phrase halves."""
        vocab = [
            "hall_of_fame",
            "gold",
            "chief_of_political_bureau",
            "effective",
            "beware_of_the_dog_in_the_yard",
        ]
        common_terms = frozenset(["in", "the", "of"])
        result = list(pseudocorpus(vocab, "_", common_terms=common_terms))
        self.assertEqual(
            result,
            [["hall", "of", "fame"],
             ["chief", "of", "political_bureau"],
             ["chief_of_political", "bureau"],
             ["beware", "of", "the", "dog_in_the_yard"],
             ["beware_of_the_dog", "in", "the", "yard"]])


class TestPhraseAnalysis(unittest.TestCase):
    """Tests for ``SentenceAnalyzer.analyze_sentence`` driven by a fixed score table."""

    class AnalysisTester(SentenceAnalyzer):
        """Analyzer whose scores come from a dict keyed by the b"_"-joined components."""

        def __init__(self, scores):
            self.scores = scores

        def score_item(self, worda, wordb, components, scorer):
            """Override for test purpose: look the score up, defaulting to -1."""
            if worda is not None and wordb is not None:
                bigram_word = b"_".join(components)
                return self.scores.get(bigram_word, -1)
            return -1

    def analyze(self, scores, sentence):
        """Run the test analyzer over `sentence` and return its output as a list."""
        analyzer = self.AnalysisTester(scores)
        return list(analyzer.analyze_sentence(
            sentence,
            threshold=1,
            common_terms={b"a", b"the", b"with", b"of"},
            scorer=None))

    def analyze_words(self, scores, sentence):
        """Return the analyzed sentence as unicode tokens, phrases joined with "_"."""
        # Each analyzer item is (words, score); wrap bare words so they can be joined too.
        result = (
            w if isinstance(w, (tuple, list)) else [w]
            for w, _score in self.analyze(scores, sentence))
        return [b"_".join(w).decode("utf-8") for w in result]

    def test_simple_analysis(self):
        """Without any scores the sentence comes back unchanged."""
        s = ["simple", "sentence", "should", "pass"]
        result = self.analyze_words({}, s)
        self.assertEqual(result, s)

        s = ["a", "simple", "sentence", "with", "no", "bigram", "but", "common", "terms"]
        result = self.analyze_words({}, s)
        self.assertEqual(result, s)

    def test_analysis_bigrams(self):
        """Scored adjacent word pairs are merged into bigrams."""
        scores = {
            b"simple_sentence": 2,
            b"sentence_many": 2,
            b"many_possible": 2,
            b"possible_bigrams": 2,
        }

        s = ["simple", "sentence", "many", "possible", "bigrams"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"])

        s = ["some", "simple", "sentence", "many", "bigrams"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"])

        s = ["some", "unrelated", "simple", "words"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, s)

    def test_analysis_common_terms(self):
        """Common terms that do not bridge a scored phrase are left as plain tokens."""
        scores = {
            b"simple_sentence": 2,
            b"sentence_many": 2,
            b"many_possible": 2,
            b"possible_bigrams": 2,
        }

        s = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"])

        s = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, [
            "simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"])

    def test_analysis_common_terms_in_between(self):
        """Common terms between two words become part of the phrase when it is scored."""
        scores = {
            b"simple_sentence": 2,
            b"sentence_with_many": 2,
            b"many_possible": 2,
            b"many_of_the_possible": 2,
            b"possible_bigrams": 2,
        }

        s = ["sentence", "with", "many", "possible", "bigrams"]
        result = self.analyze_words(scores, s)
        self.assertEqual(result, ["sentence_with_many", "possible_bigrams"])

        s = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"]
        result = self.analyze_words(scores, s)
        self.assertEqual(
            result,
            ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"])


class PhrasesData:
    """Fixture mixin: corpus and expected bigrams for tests without common terms."""

    sentences = common_texts + [
        ['graph', 'minors', 'survey', 'human', 'interface']
    ]
    unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
    common_terms = frozenset()

    bigram1 = u'response_time'
    bigram2 = u'graph_minors'
    bigram3 = u'human_interface'

    def gen_sentences(self):
        """Return the corpus as a generator of generators (single-pass iterables)."""
        return ((w for w in sentence) for sentence in self.sentences)


class PhrasesCommon:
    """ Tests that need to be run for both Phrases and Phraser classes."""

    def setUp(self):
        self.bigram = Phrases(
            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
        self.bigram_default = Phrases(
            self.sentences, common_terms=self.common_terms)
        self.bigram_utf8 = Phrases(
            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
        self.bigram_unicode = Phrases(
            self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)

    def _assert_bigrams_seen(self, transformed_corpus):
        """Assert that both `bigram1` and `bigram2` occur somewhere in `transformed_corpus`.

        Iteration stops as soon as both bigrams have been found.
        """
        bigram1_seen = False
        bigram2_seen = False
        for sentence in transformed_corpus:
            if not bigram1_seen and self.bigram1 in sentence:
                bigram1_seen = True
            if not bigram2_seen and self.bigram2 in sentence:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break
        self.assertTrue(bigram1_seen and bigram2_seen)

    def testEmptyPhrasifiedSentencesIterator(self):
        """Test that a phrasified corpus can be iterated twice with equal, non-empty results."""
        bigram_phrases = Phrases(self.sentences)
        bigram_phraser = Phraser(bigram_phrases)
        trigram_phrases = Phrases(bigram_phraser[self.sentences])
        trigram_phraser = Phraser(trigram_phrases)
        trigrams = trigram_phraser[bigram_phraser[self.sentences]]
        fst, snd = list(trigrams), list(trigrams)
        self.assertEqual(fst, snd)
        self.assertNotEqual(snd, [])

    def testEmptyInputsOnBigramConstruction(self):
        """Test that empty inputs don't throw errors and return the expected result."""
        # Empty list -> empty list
        self.assertEqual(list(self.bigram_default[[]]), [])
        # Empty iterator -> empty list
        self.assertEqual(list(self.bigram_default[iter(())]), [])
        # List of empty list -> list of empty list
        self.assertEqual(list(self.bigram_default[[[], []]]), [[], []])
        # Iterator of empty list -> list of empty list
        self.assertEqual(list(self.bigram_default[iter([[], []])]), [[], []])
        # Iterator of empty iterator -> list of empty list
        self.assertEqual(list(self.bigram_default[(iter(()) for i in range(2))]), [[], []])

    def testSentenceGeneration(self):
        """Test basic bigram using a dummy corpus."""
        # test that we generate the same amount of sentences as the input
        self.assertEqual(len(self.sentences), len(list(self.bigram_default[self.sentences])))

    def testSentenceGenerationWithGenerator(self):
        """Test basic bigram production when corpus is a generator."""
        self.assertEqual(
            len(list(self.gen_sentences())),
            len(list(self.bigram_default[self.gen_sentences()])))

    def testBigramConstruction(self):
        """Test Phrases bigram construction building."""
        # with this setting we should get response_time and graph_minors
        self._assert_bigrams_seen(self.bigram[self.sentences])

        # check the same thing, this time using single doc transformation
        # last sentence should contain both graph_minors and human_interface
        self.assertIn(self.bigram1, self.bigram[self.sentences[1]])
        self.assertIn(self.bigram1, self.bigram[self.sentences[4]])
        self.assertIn(self.bigram2, self.bigram[self.sentences[-2]])
        self.assertIn(self.bigram2, self.bigram[self.sentences[-1]])
        self.assertIn(self.bigram3, self.bigram[self.sentences[-1]])

    def testBigramConstructionFromGenerator(self):
        """Test Phrases bigram construction building when corpus is a generator"""
        self._assert_bigrams_seen(self.bigram[self.gen_sentences()])

    def testBigramConstructionFromArray(self):
        """Test Phrases bigram construction building when corpus is a numpy array"""
        # The sentences have different lengths; an explicit object dtype is required,
        # because NumPy >= 1.24 refuses to build an array from ragged nested sequences.
        self._assert_bigrams_seen(self.bigram[np.array(self.sentences, dtype=object)])

    def testEncoding(self):
        """Test that both utf8 and unicode input work; output must be unicode."""
        expected = [u'survey', u'user', u'computer', u'system', u'response_time']

        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
        self.assertIsInstance(transformed, six.text_type)


# Scorer for testCustomScorer.
# It lives at module level (outside the test) so that it can be pickled:
# the persistence tests save and load models that reference it.
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    """Return a score of 1 for every candidate, ignoring all arguments."""
    return 1


class TestPhrasesModel(PhrasesData, PhrasesCommon, unittest.TestCase):
    """Tests specific to the Phrases model (export, scoring, parameters)."""

    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        seen_bigrams = {phrase for phrase, _score in bigram.export_phrases(self.sentences)}

        self.assertEqual(seen_bigrams, {
            b'response time',
            b'graph minors',
            b'human interface',
        })

    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_bigrams = {phrase for phrase, _score in bigram.export_phrases(test_sentences)}

        self.assertEqual(seen_bigrams, {b'graph minors', b'human interface'})

    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = {
            round(score, 3) for _phrase, score in bigram.export_phrases(test_sentences)}

        self.assertEqual(seen_scores, {
            5.167,  # score for graph minors
            3.444,  # score for human interface
        })

    def test__getitem__(self):
        """ test Phrases[sentences] with a single sentence"""
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        phrased_sentence = next(iter(bigram[test_sentences]))

        self.assertEqual(phrased_sentence, ['graph_minors', 'survey', 'human_interface'])

    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = {
            round(score, 3) for _phrase, score in bigram.export_phrases(test_sentences)}

        self.assertEqual(seen_scores, {
            .882,  # score for graph minors
            .714,  # score for human interface
        })

    def testCustomScorer(self):
        """ test using a custom scoring function """
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = [score for _phrase, score in bigram.export_phrases(test_sentences)]

        self.assertTrue(all(seen_scores))  # all scores 1
        # 'graph minors' and 'survey human' and 'interface system'
        self.assertEqual(len(seen_scores), 3)

    def testBadParameters(self):
        """Test the phrases module with bad parameters."""
        # should fail with something less or equal than 0
        self.assertRaises(ValueError, Phrases, self.sentences, min_count=0)

        # threshold should be positive
        self.assertRaises(ValueError, Phrases, self.sentences, threshold=-1)

    def testPruning(self):
        """Test that max_vocab_size parameter is respected."""
        bigram = Phrases(self.sentences, max_vocab_size=5)
        self.assertLessEqual(len(bigram.vocab), 5)


class TestPhrasesPersistence(PhrasesData, unittest.TestCase):
    """Save/load tests for Phrases, including models pickled by older versions."""

    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            seen_scores = [
                score for _phrase, score in bigram_loaded.export_phrases(test_sentences)]

            self.assertTrue(all(seen_scores))  # all scores 1
            # 'graph minors' and 'survey human' and 'interface system'
            self.assertEqual(len(seen_scores), 3)

    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            seen_scores = {
                round(score, 3)
                for _phrase, score in bigram_loaded.export_phrases(test_sentences)}

            self.assertEqual(seen_scores, {
                5.167,  # score for graph minors
                3.444,  # score for human interface
            })

    def testSaveLoadStringScoring(self):
        """ Saving and loading a Phrases object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of Phrases"""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = {
            round(score, 3)
            for _phrase, score in bigram_loaded.export_phrases(test_sentences)}

        self.assertEqual(seen_scores, {
            5.167,  # score for graph minors
            3.444,  # score for human interface
        })

    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""
        bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = {
            round(score, 3)
            for _phrase, score in bigram_loaded.export_phrases(test_sentences)}

        self.assertEqual(seen_scores, {
            5.167,  # score for graph minors
            3.444,  # score for human interface
        })

    def testSaveLoadNoCommonTerms(self):
        """ Ensure backwards compatibility with old versions of Phrases, before common_terms"""
        bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
        self.assertEqual(bigram_loaded.common_terms, frozenset())
        # can make a phraser, cf #1751
        phraser = Phraser(bigram_loaded)  # does not raise
        phraser[["human", "interface", "survey"]]  # does not raise


class TestPhraserPersistence(PhrasesData, unittest.TestCase):
    """Save/load tests for Phraser, including models pickled by older versions."""

    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """
        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we do not do much with scoring, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)

    def testSaveLoad(self):
        """ Saving and loading a Phraser object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            self.assertEqual(
                bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
                ['graph_minors', 'survey', 'human_interface', 'system'])

    def testSaveLoadStringScoring(self):
        """ Saving and loading a Phraser object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of Phraser"""
        bigram_loaded = Phraser.load(datapath("phraser-scoring-str.pkl"))
        # we do not do much with scoring, just verify it's the one expected
        self.assertEqual(bigram_loaded.scoring, original_scorer)

    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phraser object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phraser"""
        bigram_loaded = Phraser.load(datapath("phraser-no-scoring.pkl"))
        # we do not do much with scoring, just verify it's the one expected
        self.assertEqual(bigram_loaded.scoring, original_scorer)

    def testSaveLoadNoCommonTerms(self):
        """ Ensure backwards compatibility with old versions of Phraser, before common_terms"""
        bigram_loaded = Phraser.load(datapath("phraser-no-common-terms.pkl"))
        self.assertEqual(bigram_loaded.common_terms, frozenset())


class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase):
    """ Test Phraser models."""

    def setUp(self):
        """Set up Phraser models for the tests."""
        bigram_phrases = Phrases(
            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
        self.bigram = Phraser(bigram_phrases)

        bigram_default_phrases = Phrases(self.sentences, common_terms=self.common_terms)
        self.bigram_default = Phraser(bigram_default_phrases)

        bigram_utf8_phrases = Phrases(
            self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
        self.bigram_utf8 = Phraser(bigram_utf8_phrases)

        bigram_unicode_phrases = Phrases(
            self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
        self.bigram_unicode = Phraser(bigram_unicode_phrases)


class CommonTermsPhrasesData:
    """This mixin lets the tests be reused, this time with the common_terms option."""

    sentences = [
        ['human', 'interface', 'with', 'computer'],
        ['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'and', 'human', 'system', 'eps'],
        ['user', 'lack', 'of', 'interest'],
        ['trees'],
        ['graph', 'of', 'trees'],
        ['data', 'and', 'graph', 'of', 'trees'],
        ['data', 'and', 'graph', 'survey'],
        ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']  # test bigrams within same sentence
    ]
    unicode_sentences = [[to_unicode(w) for w in sentence] for sentence in sentences]
    common_terms = ['of', 'and', 'for']

    bigram1 = u'lack_of_interest'
    bigram2 = u'data_and_graph'
    bigram3 = u'human_interface'
    expression1 = u'lack of interest'
    expression2 = u'data and graph'
    expression3 = u'human interface'

    def gen_sentences(self):
        """Return the corpus as a generator of generators (single-pass iterables)."""
        return ((w for w in sentence) for sentence in self.sentences)


class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel):
    """Test Phrases models with common terms"""

    def testEncoding(self):
        """Test that both utf8 and unicode input work; output must be unicode."""
        expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']

        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
        self.assertIsInstance(transformed, six.text_type)

    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_bigrams = {phrase for phrase, _score in bigram.export_phrases(test_sentences)}

        self.assertEqual(seen_bigrams, {
            b'data and graph',
            b'human interface',
        })

    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_bigrams = {phrase for phrase, _score in bigram.export_phrases(self.sentences)}

        self.assertEqual(seen_bigrams, {
            b'human interface',
            b'graph of trees',
            b'data and graph',
            b'lack of interest',
        })

    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = {
            round(score, 3) for _phrase, score in bigram.export_phrases(test_sentences)}

        # Recompute the expected scores from the model's own vocabulary counts.
        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab[b"graph"])
        data = float(bigram.vocab[b"data"])
        data_and_graph = float(bigram.vocab[b"data_and_graph"])
        human = float(bigram.vocab[b"human"])
        interface = float(bigram.vocab[b"interface"])
        human_interface = float(bigram.vocab[b"human_interface"])

        self.assertEqual(seen_scores, {
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round((human_interface - min_count) / human / interface * len_vocab, 3),
        })

    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(
            self.sentences, min_count=1, threshold=.5,
            scoring='npmi', common_terms=self.common_terms)

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = {
            round(score, 3) for _phrase, score in bigram.export_phrases(test_sentences)}

        self.assertEqual(seen_scores, {
            .74,  # score for data and graph
            .894,  # score for human interface
        })

    def testCustomScorer(self):
        """ test using a custom scoring function """
        bigram = Phrases(
            self.sentences, min_count=1, threshold=.001,
            scoring=dumb_scorer, common_terms=self.common_terms)

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        seen_scores = [score for _phrase, score in bigram.export_phrases(test_sentences)]

        self.assertTrue(all(seen_scores))  # all scores 1
        self.assertEqual(len(seen_scores), 2)  # 'data and graph' 'survey for human'

    def test__getitem__(self):
        """ test Phrases[sentences] with a single sentence"""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        phrased_sentence = next(iter(bigram[test_sentences]))

        self.assertEqual(phrased_sentence, ['data_and_graph', 'survey', 'for', 'human_interface'])


class TestPhraserModelCommonTerms(CommonTermsPhrasesData, TestPhraserModel):
    """Test Phraser models with common terms"""

    def testEncoding(self):
        """Test that both utf8 and unicode input work; output must be unicode."""
        expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest']

        self.assertEqual(self.bigram_utf8[self.sentences[1]], expected)
        self.assertEqual(self.bigram_unicode[self.sentences[1]], expected)

        transformed = ' '.join(self.bigram_utf8[self.sentences[1]])
        self.assertIsInstance(transformed, six.text_type)


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()