#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides functions for summarizing texts. Summarizing is based on
ranks of text sentences, using a variation of the TextRank algorithm [1]_.

.. [1] Federico Barrios, Federico López, Luis Argerich, Rosita Wachenchauzer (2016).
       Variations of the Similarity Function of TextRank for Automated Summarization,
       https://arxiv.org/abs/1602.03606

Data
----

.. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text.
.. data:: WEIGHT_THRESHOLD - Minimal weight of an edge between graph nodes; smaller weights are set to zero.

Example
-------
>>> from gensim.summarization.summarizer import summarize
>>> text = '''Rice Pudding - Poem by Alan Alexander Milne
... What is the matter with Mary Jane?
... She's crying with all her might and main,
... And she won't eat her dinner - rice pudding again -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her dolls and a daisy-chain,
... And a book about animals - all in vain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well, and she hasn't a pain;
... But, look at her, now she's beginning again! -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her sweets and a ride in the train,
... And I've begged her to stop for a bit and explain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well and she hasn't a pain,
... And it's lovely rice pudding for dinner again!
... What is the matter with Mary Jane?'''
>>> print(summarize(text))
And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!

"""

import logging
from gensim.utils import deprecated
from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
from gensim.corpora import Dictionary
from math import log10 as _log10
from six.moves import xrange


INPUT_MIN_LENGTH = 10

WEIGHT_THRESHOLD = 1.e-3

logger = logging.getLogger(__name__)


def _set_graph_edge_weights(graph):
    """Sets edge weights using the BM25 algorithm, in place. Weights below
    :const:`~gensim.summarization.summarizer.WEIGHT_THRESHOLD` are left at zero.
    If all weights turn out negligible, every weight is forced to 1 instead.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    """
    documents = graph.nodes()
    weights = _bm25_weights(documents)

    for i in xrange(len(documents)):
        for j in xrange(len(documents)):
            if i == j or weights[i][j] < WEIGHT_THRESHOLD:
                continue

            sentence_1 = documents[i]
            sentence_2 = documents[j]

            edge_1 = (sentence_1, sentence_2)
            edge_2 = (sentence_2, sentence_1)

            if not graph.has_edge(edge_1):
                graph.add_edge(edge_1, weights[i][j])
            if not graph.has_edge(edge_2):
                graph.add_edge(edge_2, weights[j][i])

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)
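
# A rough sketch of how the weighting step above is driven internally; the toy
# nodes below (hashable bag-of-words documents) are illustrative placeholders,
# not data produced by this module:
#
#     >>> from gensim.summarization.commons import build_graph
#     >>> nodes = [((0, 1), (1, 1)), ((1, 1), (2, 1)), ((0, 1), (2, 1))]
#     >>> g = build_graph(nodes)
#     >>> _set_graph_edge_weights(g)  # adds BM25-weighted edges between the nodes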
def _create_valid_graph(graph):
    """Sets the weight of every edge between distinct nodes to 1, in place.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    """
    nodes = graph.nodes()

    for i in xrange(len(nodes)):
        for j in xrange(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


@deprecated("Function will be removed in 4.0.0")
def _get_doc_length(doc):
    """Get length of (tokenized) document.

    Parameters
    ----------
    doc : list of (list of (tuple of int))
        Given document.

    Returns
    -------
    int
        Length of document.

    """
    return sum([item[1] for item in doc])


@deprecated("Function will be removed in 4.0.0")
def _get_similarity(doc1, doc2, vec1, vec2):
    """Returns similarity of two documents.

    Parameters
    ----------
    doc1 : list of (list of (tuple of int))
        First document.
    doc2 : list of (list of (tuple of int))
        Second document.
    vec1 : scipy.sparse array
        Sparse vector of the first document.
    vec2 : scipy.sparse array
        Sparse vector of the second document.

    Returns
    -------
    float
        Similarity of the two documents.

    """
    numerator = vec1.dot(vec2.transpose()).toarray()[0][0]
    length_1 = _get_doc_length(doc1)
    length_2 = _get_doc_length(doc2)

    denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0

    return numerator / denominator if denominator != 0 else 0


def _build_corpus(sentences):
    """Construct a corpus from the provided sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.

    Returns
    -------
    list of list of (int, int)
        Corpus built from the sentences.

    """
    split_tokens = [sentence.token.split() for sentence in sentences]
    dictionary = Dictionary(split_tokens)
    return [dictionary.doc2bow(token) for token in split_tokens]


def _get_important_sentences(sentences, corpus, important_docs):
    """Get the most important sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    corpus : list of list of (int, int)
        Provided corpus.
    important_docs : list of list of (int, int)
        Most important documents of the corpus.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.

    """
    hashable_corpus = _build_hasheable_corpus(corpus)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs]
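
# A minimal sketch of `_build_corpus` on hand-made syntactic units; the token
# strings below stand in for the stemmed output of the text cleaner and are
# illustrative only:
#
#     >>> from gensim.summarization.syntactic_unit import SyntacticUnit
#     >>> units = [SyntacticUnit("Cats chase mice.", token="cat chase mice"),
#     ...          SyntacticUnit("Mice eat cheese.", token="mice eat chees")]
#     >>> corpus = _build_corpus(units)  # one bag-of-words document per sentence
#     >>> len(corpus)
#     2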
def _get_sentences_with_word_count(sentences, word_count):
    """Get a list of sentences whose total word count is as close as possible
    to the specified `word_count`.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    word_count : int
        Target number of words in the returned sentences.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.

    """
    length = 0
    selected_sentences = []

    # Loops until the word count is reached.
    for sentence in sentences:
        words_in_sentence = len(sentence.text.split())

        # Checks if adding the sentence would move the total further away from
        # the target word count; if so, stops and returns the selection so far.
        if abs(word_count - length - words_in_sentence) > abs(word_count - length):
            return selected_sentences

        selected_sentences.append(sentence)
        length += words_in_sentence

    return selected_sentences


def _extract_important_sentences(sentences, corpus, important_docs, word_count):
    """Get the most important sentences of the `corpus`.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    corpus : list of list of (int, int)
        Provided corpus.
    important_docs : list of list of (int, int)
        Most important documents of the corpus.
    word_count : int or None
        Target number of returned words. If None, the full list of most
        important sentences is returned.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.

    """
    important_sentences = _get_important_sentences(sentences, corpus, important_docs)

    # If no "word_count" option is provided, the important sentences are
    # returned as they are. Otherwise, the list is trimmed to roughly
    # `word_count` words.
    return important_sentences \
        if word_count is None \
        else _get_sentences_with_word_count(important_sentences, word_count)


def _format_results(extracted_sentences, split):
    """Returns `extracted_sentences` in the desired format.

    Parameters
    ----------
    extracted_sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    split : bool
        If True, the sentences are returned as a list. Otherwise they are
        merged into a single newline-separated string.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Formatted result.

    """
    if split:
        return [sentence.text for sentence in extracted_sentences]
    return "\n".join([sentence.text for sentence in extracted_sentences])


def _build_hasheable_corpus(corpus):
    """Convert the documents of `corpus` into hashable tuples.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.

    Returns
    -------
    list of tuple of (int, int)
        Hashable corpus.

    """
    return [tuple(doc) for doc in corpus]


def summarize_corpus(corpus, ratio=0.2):
    """Get a list of the most important documents of a corpus using a variation
    of the TextRank algorithm [1]_. Used as a helper for
    :func:`~gensim.summarization.summarizer.summarize`.

    Note
    ----
    The input must have at least
    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the
    summary to make sense.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.

    Returns
    -------
    list of list of (int, int)
        Most important documents of the given `corpus`, sorted by document
        score, highest first.

    """
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # Eigenvectors cannot be calculated when fewer than 3 unique documents
    # remain reachable, so the user is warned to add more text and the
    # function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
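
# A hedged end-to-end sketch of `summarize_corpus`: split raw text into
# sentences, build a bag-of-words corpus, and rank its documents. The
# `some_long_text` variable is a placeholder for a real string; with fewer
# than INPUT_MIN_LENGTH sentences a warning is logged:
#
#     >>> from gensim.summarization.textcleaner import split_sentences
#     >>> from gensim.corpora import Dictionary
#     >>> sentences = split_sentences(some_long_text)
#     >>> tokens = [sentence.split() for sentence in sentences]
#     >>> dictionary = Dictionary(tokens)
#     >>> corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]
#     >>> summarize_corpus(corpus, ratio=0.2)  # doctest: +SKIP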
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Get a summarized version of the given text. The output summary consists
    of the most representative sentences and is returned as a string, divided
    by newlines.

    Note
    ----
    The input should be a string and must contain at least
    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for the
    summary to make sense. The text will be split into sentences using the
    `split_sentences` method of the :mod:`gensim.summarization.textcleaner`
    module. Note that newlines divide sentences.

    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words the output will contain. If both `ratio` and
        `word_count` are provided, the ratio is ignored.
    split : bool, optional
        If True, a list of sentences is returned. Otherwise the sentences are
        joined into a single string.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of the given text.

    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

    # If no important documents could be found, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)
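
# The module docstring demonstrates the default call; a short sketch of the
# remaining knobs (`word_count` and `split`), reusing the `text` variable from
# the docstring example above:
#
#     >>> summarize(text, word_count=25)          # cap the summary at roughly 25 words
#     >>> summarize(text, ratio=0.3, split=True)  # list of sentences instead of one string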