laywerrobot/lib/python3.6/site-packages/gensim/summarization/summarizer.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module provides functions for summarizing texts. Summarizing is based on
ranks of text sentences using a variation of the TextRank algorithm [1]_.
.. [1] Federico Barrios, Federico López, Luis Argerich, Rosita Wachenchauzer (2016).
Variations of the Similarity Function of TextRank for Automated Summarization,
https://arxiv.org/abs/1602.03606
Data
----
.. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text
.. data:: WEIGHT_THRESHOLD - Minimal weight of edge between graph nodes. Smaller weights set to zero.
Example
-------
>>> from gensim.summarization.summarizer import summarize
>>> text = '''Rice Pudding - Poem by Alan Alexander Milne
... What is the matter with Mary Jane?
... She's crying with all her might and main,
... And she won't eat her dinner - rice pudding again -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her dolls and a daisy-chain,
... And a book about animals - all in vain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well, and she hasn't a pain;
... But, look at her, now she's beginning again! -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her sweets and a ride in the train,
... And I've begged her to stop for a bit and explain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well and she hasn't a pain,
... And it's lovely rice pudding for dinner again!
... What is the matter with Mary Jane?'''
>>> print(summarize(text))
And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!
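
The summary can also be returned as a list of sentences, or capped by word
count instead of ratio (a minimal usage sketch; outputs are omitted here
because they depend on the computed sentence scores):

>>> sentence_list = summarize(text, split=True)  # the same summary, as a list of str
>>> shorter = summarize(text, word_count=25)  # cap the output length; `ratio` is ignored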
"""
import logging
from gensim.utils import deprecated
from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
from gensim.corpora import Dictionary
from math import log10 as _log10
from six.moves import xrange
INPUT_MIN_LENGTH = 10
WEIGHT_THRESHOLD = 1.e-3
logger = logging.getLogger(__name__)
def _set_graph_edge_weights(graph):
"""Sets weights using BM25 algorithm. Leaves small weights as zeroes. If all weights are fairly small,
forces all weights to 1, inplace.
Parameters
----------
graph : :class:`~gensim.summarization.graph.Graph`
Given graph.
"""
documents = graph.nodes()
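# weights[i][j] holds the BM25 score of document j with document i acting as
# the query; BM25 is not symmetric, so both edge directions are added below
# with their own weights.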
weights = _bm25_weights(documents)
for i in xrange(len(documents)):
for j in xrange(len(documents)):
if i == j or weights[i][j] < WEIGHT_THRESHOLD:
continue
sentence_1 = documents[i]
sentence_2 = documents[j]
edge_1 = (sentence_1, sentence_2)
edge_2 = (sentence_2, sentence_1)
if not graph.has_edge(edge_1):
graph.add_edge(edge_1, weights[i][j])
if not graph.has_edge(edge_2):
graph.add_edge(edge_2, weights[j][i])
# Handles the case in which all similarities are zero.
# The resultant summary will consist of random sentences.
if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
_create_valid_graph(graph)
def _create_valid_graph(graph):
"""Sets all weights of edges for different edges as 1, inplace.
Parameters
----------
graph : :class:`~gensim.summarization.graph.Graph`
Given graph.
"""
nodes = graph.nodes()
for i in xrange(len(nodes)):
for j in xrange(len(nodes)):
if i == j:
continue
edge = (nodes[i], nodes[j])
if graph.has_edge(edge):
graph.del_edge(edge)
graph.add_edge(edge, 1)
@deprecated("Function will be removed in 4.0.0")
def _get_doc_length(doc):
"""Get length of (tokenized) document.
Parameters
----------
doc : list of (int, int)
Given document in bag-of-words (BoW) format.
Returns
-------
int
Length of document, i.e. the total number of tokens (sum of frequencies).
"""
return sum([item[1] for item in doc])
@deprecated("Function will be removed in 4.0.0")
def _get_similarity(doc1, doc2, vec1, vec2):
"""Returns similarity of two documents.
Parameters
----------
doc1 : list of (int, int)
First document in BoW format.
doc2 : list of (int, int)
Second document in BoW format.
vec1 : scipy.sparse matrix
Sparse vector of the first document.
vec2 : scipy.sparse matrix
Sparse vector of the second document.
Returns
-------
float
Similarity of two documents.
"""
numerator = vec1.dot(vec2.transpose()).toarray()[0][0]
length_1 = _get_doc_length(doc1)
length_2 = _get_doc_length(doc2)
denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0
return numerator / denominator if denominator != 0 else 0
def _build_corpus(sentences):
"""Construct corpus from provided sentences.
Parameters
----------
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Given sentences.
Returns
-------
list of list of (int, int)
Corpus built from sentences.
"""
split_tokens = [sentence.token.split() for sentence in sentences]
dictionary = Dictionary(split_tokens)
return [dictionary.doc2bow(token) for token in split_tokens]
def _get_important_sentences(sentences, corpus, important_docs):
"""Get most important sentences.
Parameters
----------
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Given sentences.
corpus : list of list of (int, int)
Provided corpus.
important_docs : list of list of (int, int)
Most important documents of the corpus.
Returns
-------
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Most important sentences.
"""
hashable_corpus = _build_hasheable_corpus(corpus)
sentences_by_corpus = dict(zip(hashable_corpus, sentences))
return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs]
def _get_sentences_with_word_count(sentences, word_count):
"""Get list of sentences. Total number of returned words close to specified `word_count`.
Parameters
----------
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Given sentences.
word_count : int or None
Target number of words in the output. If None, all of the most important sentences are returned.
Returns
-------
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Most important sentences.
"""
length = 0
selected_sentences = []
# Loops until the word count is reached.
for sentence in sentences:
words_in_sentence = len(sentence.text.split())
# Checks if the inclusion of the sentence gives a better approximation
# to the word parameter.
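# E.g. with word_count=20 and 18 words already selected, a 10-word sentence
# is rejected, since abs(20 - 18 - 10) = 8 > abs(20 - 18) = 2.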
if abs(word_count - length - words_in_sentence) > abs(word_count - length):
return selected_sentences
selected_sentences.append(sentence)
length += words_in_sentence
return selected_sentences
def _extract_important_sentences(sentences, corpus, important_docs, word_count):
"""Get most important sentences of the `corpus`.
Parameters
----------
sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Given sentences.
corpus : list of list of (int, int)
Provided corpus.
important_docs : list of list of (int, int)
Most important docs of the corpus.
word_count : int or None
Target number of words in the output. If None, all of the most important sentences are returned.
Returns
-------
list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
Most important sentences.
"""
important_sentences = _get_important_sentences(sentences, corpus, important_docs)
# If no "word_count" option is provided, the number of sentences is
# reduced by the provided ratio. Else, the ratio is ignored.
return important_sentences \
if word_count is None \
else _get_sentences_with_word_count(important_sentences, word_count)
def _format_results(extracted_sentences, split):
"""Returns `extracted_sentences` in desired format.
Parameters
----------
extracted_sentences : list of :class:~gensim.summarization.syntactic_unit.SyntacticUnit
Given sentences.
split : bool
If True, sentences will be returned as a list. Otherwise they will be merged and returned as a single string.
Returns
-------
list of str
If `split` **OR**
str
Formatted result.
"""
if split:
return [sentence.text for sentence in extracted_sentences]
return "\n".join([sentence.text for sentence in extracted_sentences])
def _build_hasheable_corpus(corpus):
"""Hashes and get `corpus`.
Parameters
----------
corpus : list of list of (int, int)
Given corpus.
Returns
-------
list of tuple of (int, int)
Hashable corpus.
"""
return [tuple(doc) for doc in corpus]
def summarize_corpus(corpus, ratio=0.2):
"""Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.
Used as a helper for :func:`~gensim.summarization.summarizer.summarize`.
Note
----
The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
to make sense.
Parameters
----------
corpus : list of list of (int, int)
Given corpus.
ratio : float, optional
Number between 0 and 1 that determines the proportion of the number of
documents of the original corpus to be chosen for the summary.
Returns
-------
list of str
Most important documents of given `corpus` sorted by the document score, highest first.
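
Example
-------
A minimal sketch on a toy corpus (a real input should contain at least
:const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents,
otherwise a warning is logged and the result may not be meaningful):

>>> from gensim.summarization.summarizer import summarize_corpus
>>> from gensim.corpora import Dictionary
>>> raw_texts = [
...     "the cat sat on the mat",
...     "the dog sat on the log",
...     "cats and dogs do not like each other",
...     "the mat and the log lay in the yard",
... ]
>>> tokenized = [text.split() for text in raw_texts]
>>> dictionary = Dictionary(tokenized)
>>> corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
>>> most_important = summarize_corpus(corpus, ratio=0.5)  # list of BoW documents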
"""
hashable_corpus = _build_hasheable_corpus(corpus)
# If the corpus is empty, the function ends.
if len(corpus) == 0:
logger.warning("Input corpus is empty.")
return []
# Warns the user if there are too few documents.
if len(corpus) < INPUT_MIN_LENGTH:
logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)
graph = _build_graph(hashable_corpus)
_set_graph_edge_weights(graph)
_remove_unreachable_nodes(graph)
# Cannot calculate eigenvectors if number of unique documents in corpus < 3.
# Warns user to add more text. The function ends.
if len(graph.nodes()) < 3:
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
return []
pagerank_scores = _pagerank(graph)
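# Documents dropped as unreachable have no pagerank score; `.get(doc, 0)` ranks them last.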
hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)
return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize(text, ratio=0.2, word_count=None, split=False):
"""Get a summarized version of the given text.
The output summary will consist of the most representative sentences
and will be returned as a string, divided by newlines.
Note
----
The input should be a string, and must contain at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
sentences for the summary to make sense.
The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.textcleaner`
module. Note that newlines divide sentences.
Parameters
----------
text : str
Given text.
ratio : float, optional
Number between 0 and 1 that determines the proportion of the number of
sentences of the original text to be chosen for the summary.
word_count : int or None, optional
Determines how many words the output will contain.
If both parameters are provided, the ratio will be ignored.
split : bool, optional
If True, a list of sentences will be returned. Otherwise, the sentences
will be joined into a single string.
Returns
-------
list of str
If `split` **OR**
str
Most representative sentences of the given text.
"""
# Gets a list of processed sentences.
sentences = _clean_text_by_sentences(text)
# If no sentence could be identified, the function ends.
if len(sentences) == 0:
logger.warning("Input text is empty.")
return [] if split else u""
# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
if len(sentences) == 1:
raise ValueError("input must have more than one sentence")
# Warns if the text is too short.
if len(sentences) < INPUT_MIN_LENGTH:
logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)
corpus = _build_corpus(sentences)
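# If word_count is set, request the whole ranked document list (ratio=1);
# the output is trimmed to the requested word count further down.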
most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)
# If couldn't get important docs, the algorithm ends.
if not most_important_docs:
logger.warning("Couldn't get relevant sentences.")
return [] if split else u""
# Extracts the most important sentences with the selected criterion.
extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)
# Sorts the extracted sentences by order of appearance in the original text.
extracted_sentences.sort(key=lambda s: s.index)
return _format_results(extracted_sentences, split)