# Automated tests for the gensim.topic_coherence.text_analysis module.
import logging
import unittest

from gensim.corpora.dictionary import Dictionary
from gensim.topic_coherence.text_analysis import (
    InvertedIndexAccumulator, WordOccurrenceAccumulator, ParallelWordOccurrenceAccumulator,
    CorpusAccumulator)
from gensim.test.utils import common_texts
class BaseTestCases(object):

    class TextAnalyzerTestBase(unittest.TestCase):
        """Shared fixtures and tests for text-analysis accumulator classes.

        Concrete subclasses set ``accumulator_cls``; the occurrence and
        co-occurrence counting tests below run against that class.
        """

        # Tiny hand-built corpus with a fixed token -> id mapping so that
        # expected index contents can be written out literally in the tests.
        texts = [
            ['this', 'is', 'a'],
            ['test', 'document'],
            ['this', 'test', 'document'],
            ['test', 'test', 'this']
        ]
        token2id = {
            'this': 10,
            'is': 15,
            'a': 20,
            'test': 21,
            'document': 17
        }
        dictionary = Dictionary(texts)
        dictionary.token2id = token2id
        dictionary.id2token = {idx: word for word, idx in token2id.items()}
        top_ids = set(token2id.values())

        # Second corpus: gensim's stock common_texts plus one extra document
        # to exercise counting of a token repeated within a single text.
        texts2 = common_texts + [['user', 'user']]
        dictionary2 = Dictionary(texts2)
        dictionary2.id2token = {idx: word for word, idx in dictionary2.token2id.items()}
        top_ids2 = set(dictionary2.token2id.values())

        accumulator_cls = None  # concrete subclasses must override

        def init_accumulator(self):
            """Build an accumulator over the hand-built corpus fixtures."""
            return self.accumulator_cls(self.top_ids, self.dictionary)

        def init_accumulator2(self):
            """Build an accumulator over the common_texts-based fixtures."""
            return self.accumulator_cls(self.top_ids2, self.dictionary2)

        def test_occurrence_counting(self):
            accumulator = self.init_accumulator().accumulate(self.texts, 3)

            self.assertEqual(3, accumulator.get_occurrences("this"))
            self.assertEqual(1, accumulator.get_occurrences("is"))
            self.assertEqual(1, accumulator.get_occurrences("a"))

            self.assertEqual(2, accumulator.get_co_occurrences("test", "document"))
            self.assertEqual(2, accumulator.get_co_occurrences("test", "this"))
            self.assertEqual(1, accumulator.get_co_occurrences("is", "a"))

        def test_occurrence_counting2(self):
            # Window size 110 exceeds every document length, so each whole
            # document is a single window.
            accumulator = self.init_accumulator2().accumulate(self.texts2, 110)

            for word, count in [("human", 2), ("user", 4), ("graph", 3), ("trees", 3)]:
                self.assertEqual(count, accumulator.get_occurrences(word))

            cases = [
                (1, ("human", "interface")),
                (2, ("system", "user")),
                (2, ("graph", "minors")),
                (2, ("graph", "trees")),
                (4, ("user", "user")),
                (3, ("graph", "graph")),
                (0, ("time", "eps"))
            ]
            token2id = self.dictionary2.token2id
            for expected_count, (word1, word2) in cases:
                # Co-occurrence counts must be symmetric in word order...
                self.assertEqual(expected_count, accumulator.get_co_occurrences(word1, word2))
                self.assertEqual(expected_count, accumulator.get_co_occurrences(word2, word1))

                # ...and identical when the words are passed as token ids.
                id1, id2 = token2id[word1], token2id[word2]
                self.assertEqual(expected_count, accumulator.get_co_occurrences(id1, id2))
                self.assertEqual(expected_count, accumulator.get_co_occurrences(id2, id1))

        def test_occurences_for_irrelevant_words(self):
            # NOTE(review): method name keeps its historical misspelling
            # ("occurences") to avoid churning test identifiers.
            accumulator = self.init_accumulator().accumulate(self.texts, 2)
            self.assertRaises(KeyError, accumulator.get_occurrences, "irrelevant")
            self.assertRaises(KeyError, accumulator.get_co_occurrences, "test", "irrelevant")
class TestInvertedIndexAccumulator(BaseTestCases.TextAnalyzerTestBase):
    """Runs the shared accumulator tests against InvertedIndexAccumulator,
    plus direct checks of the inverted index it builds."""

    accumulator_cls = InvertedIndexAccumulator

    def test_accumulate1(self):
        # With window size 2, accumulation slides over these virtual windows:
        # [['this', 'is'], ['is', 'a'], ['test', 'document'], ['this', 'test'],
        # ['test', 'document'], ['test', 'test'], ['test', 'this']]
        accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)
        accumulator = accumulator.accumulate(self.texts, 2)

        expected = {
            10: {0, 3, 6},
            15: {0, 1},
            20: {1},
            21: {2, 3, 4, 5, 6},
            17: {2, 4}
        }
        self.assertDictEqual(expected, accumulator.index_to_dict())

    def test_accumulate2(self):
        # Window size 3 covers each full document, so windows are simply:
        # [['this', 'is', 'a'], ['test', 'document'], ['this', 'test', 'document'],
        # ['test', 'test', 'this']
        accumulator = InvertedIndexAccumulator(self.top_ids, self.dictionary)
        accumulator = accumulator.accumulate(self.texts, 3)

        expected = {
            10: {0, 2, 3},
            15: {0},
            20: {0},
            21: {1, 2, 3},
            17: {1, 2}
        }
        self.assertDictEqual(expected, accumulator.index_to_dict())
class TestWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase):
    """Run the shared accumulator tests against WordOccurrenceAccumulator."""
    accumulator_cls = WordOccurrenceAccumulator
class TestParallelWordOccurrenceAccumulator(BaseTestCases.TextAnalyzerTestBase):
    """Run the shared accumulator tests against ParallelWordOccurrenceAccumulator.

    Overrides the factory methods because the parallel accumulator takes an
    extra leading argument: the number of worker processes.
    """
    accumulator_cls = ParallelWordOccurrenceAccumulator

    def init_accumulator(self):
        # 2 = number of worker processes; remaining args match the base class.
        return self.accumulator_cls(2, self.top_ids, self.dictionary)

    def init_accumulator2(self):
        # 2 = number of worker processes; remaining args match the base class.
        return self.accumulator_cls(2, self.top_ids2, self.dictionary2)
class TestCorpusAnalyzer(unittest.TestCase):
    """Tests for CorpusAccumulator, which consumes a bag-of-words corpus
    rather than tokenized texts."""

    def setUp(self):
        fixtures = BaseTestCases.TextAnalyzerTestBase
        self.dictionary = fixtures.dictionary
        self.top_ids = fixtures.top_ids
        self.corpus = [self.dictionary.doc2bow(text) for text in fixtures.texts]

    def test_index_accumulation(self):
        accumulator = CorpusAccumulator(self.top_ids).accumulate(self.corpus)

        # Each whole document acts as one window, so the inverted index maps
        # token ids to the set of document indices containing them.
        expected_index = {
            10: {0, 2, 3},
            15: {0},
            20: {0},
            21: {1, 2, 3},
            17: {1, 2}
        }
        self.assertDictEqual(expected_index, accumulator.index_to_dict())

        self.assertEqual(3, accumulator.get_occurrences(10))
        self.assertEqual(2, accumulator.get_occurrences(17))
        self.assertEqual(2, accumulator.get_co_occurrences(10, 21))
        self.assertEqual(1, accumulator.get_co_occurrences(10, 17))
if __name__ == '__main__':
    # Quiet the root logger so test output isn't drowned in gensim's logging.
    logging.root.setLevel(logging.WARNING)
    unittest.main()