#!/usr/bin/env python
# encoding: utf-8
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated test to reproduce the keyword-extraction results of Mihalcea and Tarau (2004).

Mihalcea and Tarau (2004) introduced the TextRank summarization algorithm.
As a validation of the gensim implementation, we reproduce their results
in this test.
"""

import os.path
import logging
import unittest

from gensim import utils
from gensim.summarization import keywords
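
# A minimal usage sketch of the `keywords` API exercised by the tests below
# (illustrative only; `text` stands for any sufficiently long input string):
#
#     kws = keywords(text, split=True)             # list of keyword strings
#     top = keywords(text, words=15, split=True)   # limit the number of keywords
#     nn = keywords(text, pos_filter=['NN'], split=True)  # keep only nouns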


class TestKeywordsTest(unittest.TestCase):

    def test_text_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # Compare against the reference as sets (order-insensitive).
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})

    def test_text_keywords_words(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Request 15 keywords; the `words` count is approximate, and this
        # text is expected to yield 16 entries.
        generated_keywords = keywords(text, words=15, split=True)

        self.assertEqual(len(generated_keywords), 16)

    def test_text_keywords_pos(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords, restricted to nouns, verbs, and adjectives
        generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # Compare against the reference as sets (order-insensitive).
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})

    def test_text_keywords_on_short_input_text(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
            text = f.read()

        # Keep only the first 8 sentences to make the text shorter.
        text = "\n".join(text.split('\n')[:8])

        # Short input should not raise; keywords() should still return a result.
        self.assertTrue(keywords(text) is not None)

    def test_keywords_ratio(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Check that the ratio parameter is well behaved. Because length is
        # measured on the tokenized, cleaned text, we only compare keyword
        # counts: ratio=0.1 yields 12 keywords and ratio=0.2 yields 21 on
        # this text, so the counts should stand in a ratio of about 21/12.
        # The values 10% and 20% were carefully selected for this test to
        # avoid numerical instabilities when several keywords have almost
        # the same score.
        selected_docs_12 = keywords(text, ratio=0.1, split=True)
        selected_docs_21 = keywords(text, ratio=0.2, split=True)

        self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)

    def test_text_keywords_with_small_graph(self):
        # Regression test: this text yields only a 2x2 word graph.
        text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
        kwds = keywords(text, words=1, split=True)
        self.assertTrue(len(kwds))


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
    unittest.main()