#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated test to reproduce the keyword-extraction results of Mihalcea and Tarau (2004).

Mihalcea and Tarau (2004) introduced the TextRank summarization algorithm.
As a validation of the gensim implementation, we reproduce their results
in this test.
"""

import os.path
import logging
import unittest

from gensim import utils
from gensim.summarization import keywords
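
# A minimal usage sketch of the `keywords` API exercised by the tests below
# (illustrative only; `text` stands for any sufficiently long input string):
#
#     kws = keywords(text, split=True)             # list of keyword strings
#     top = keywords(text, words=15, split=True)   # limit the number of keywords
#     nn = keywords(text, pos_filter=['NN'], split=True)  # keep only nouns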


class TestKeywordsTest(unittest.TestCase):

    def test_text_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # Compare against the reference as sets (order-insensitive).
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})

    def test_text_keywords_words(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Request 15 keywords; the `words` count is approximate, and this
        # text is expected to yield 16 entries.
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)

    def test_text_keywords_pos(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords, restricted to nouns, verbs, and adjectives
        generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # Compare against the reference as sets (order-insensitive).
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})

    def test_text_keywords_on_short_input_text(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        # Keep only the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        # Short input should not raise; keywords() should still return a result.
        self.assertTrue(keywords(text) is not None)

    def test_keywords_ratio(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Check that the ratio parameter is well behaved. Because length is
        # measured on the tokenized, cleaned text, we only compare keyword
        # counts: ratio=0.1 yields 12 keywords and ratio=0.2 yields 21 on
        # this text, so the counts should stand in a ratio of about 21/12.
        # The values 10% and 20% were carefully selected for this test to
        # avoid numerical instabilities when several keywords have almost
        # the same score.
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)

    def test_text_keywords_with_small_graph(self):
        # Regression test: this text yields only a 2x2 word graph.
        text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
        kwds = keywords(text, words=1, split=True)
        self.assertTrue(len(kwds))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()