laywerrobot/lib/python3.6/site-packages/gensim/test/test_keywords.py
2020-08-27 21:55:39 +02:00

101 lines
3.8 KiB
Python

#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automated test to reproduce the results of Mihalcea and Tarau (2004).
Mihalcea and Tarau (2004) introduced the TextRank summarization algorithm.
To validate the gensim implementation, this test reproduces their results.
"""
import os.path
import logging
import unittest
from gensim import utils
from gensim.summarization import keywords
class TestKeywordsTest(unittest.TestCase):
    """Tests for ``gensim.summarization.keywords``.

    Fixture files are read from the ``test_data`` directory that sits next to
    this module.
    """

    def _read_test_data(self, filename):
        """Return the whole content of ``filename`` from the ``test_data`` directory."""
        path = os.path.join(os.path.dirname(__file__), 'test_data', filename)
        with utils.smart_open(path, mode="r") as f:
            return f.read()

    def test_text_keywords(self):
        """Keywords of the reference text equal the stored reference keyword set."""
        text = self._read_test_data("mihalcea_tarau.txt")

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference (one keyword per line).
        kw = self._read_test_data("mihalcea_tarau.kw.txt").strip().split("\n")

        # Compare as sets: ordering of the keywords is not part of the check.
        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})

    def test_text_keywords_words(self):
        """The ``words`` parameter yields the pinned number of keywords."""
        text = self._read_test_data("mihalcea_tarau.txt")

        # Request keywords with words=15.
        # NOTE(review): an earlier comment here said "exactly 13 keywords", which
        # matched neither the words=15 argument nor the asserted count of 16.
        # The pinned values are kept as-is -- confirm why 16 entries are expected.
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)

    def test_text_keywords_pos(self):
        """Keywords restricted by ``pos_filter`` equal the stored reference set."""
        text = self._read_test_data("mihalcea_tarau.txt")

        # calculate keywords using only certain parts of speech
        generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # To be compared to the reference (one keyword per line).
        kw = self._read_test_data("mihalcea_tarau.kwpos.txt").strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})

    def test_text_summarization_raises_exception_on_short_input_text(self):
        """``keywords`` returns a non-None result for a shortened input text.

        Despite the method name, no exception is expected here: the test only
        checks that the call completes and its result is not ``None``.
        """
        text = self._read_test_data("testsummarization_unrelated.txt")

        # Keep only the first 8 lines to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        self.assertIsNotNone(keywords(text))

    def test_keywords_ratio(self):
        """The number of keywords grows with ``ratio`` by the pinned factor."""
        text = self._read_test_data("mihalcea_tarau.txt")

        # Check ratio parameter is well behaved. Because length is taken on tokenized
        # clean text, the result sizes are not expected to be exactly proportional to
        # the ratio; we check that ratio 20% gives about 21/12 times as many keywords
        # as ratio 10% (compared to one decimal place).
        # Values of 10% and 20% were carefully selected for this test to avoid
        # numerical instabilities when several keywords have almost the same score
        keywords_ratio_10 = keywords(text, ratio=0.1, split=True)
        keywords_ratio_20 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(
            float(len(keywords_ratio_20)) / len(keywords_ratio_10),
            float(21) / 12,
            places=1,
        )

    def test_text_keywords_with_small_graph(self):
        """``keywords`` returns at least one keyword for a very short text."""
        # regression test, we get graph 2x2 on this text
        text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
        kwds = keywords(text, words=1, split=True)
        self.assertGreater(len(kwds), 0)
if __name__ == '__main__':
    # When run as a script: configure DEBUG-level logging with a timestamped
    # format, then hand control to the unittest runner.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s : %(levelname)s : %(message)s',
    )
    unittest.main()