#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module provides functions for summarizing texts. Summarizing is based on
|
||
ranks of text sentences using a variation of the TextRank algorithm [1]_.
|
||
|
||
.. [1] Federico Barrios, Federico L´opez, Luis Argerich, Rosita Wachenchauzer (2016).
|
||
Variations of the Similarity Function of TextRank for Automated Summarization,
|
||
https://arxiv.org/abs/1602.03606
|
||
|
||
|
||
Data
----

.. data:: INPUT_MIN_LENGTH - Minimal number of sentences in text.
.. data:: WEIGHT_THRESHOLD - Minimal weight of an edge between graph nodes. Edges with smaller weights are set to zero.

Example
-------

>>> from gensim.summarization.summarizer import summarize
>>> text = '''Rice Pudding - Poem by Alan Alexander Milne
... What is the matter with Mary Jane?
... She's crying with all her might and main,
... And she won't eat her dinner - rice pudding again -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her dolls and a daisy-chain,
... And a book about animals - all in vain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well, and she hasn't a pain;
... But, look at her, now she's beginning again! -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... I've promised her sweets and a ride in the train,
... And I've begged her to stop for a bit and explain -
... What is the matter with Mary Jane?
... What is the matter with Mary Jane?
... She's perfectly well and she hasn't a pain,
... And it's lovely rice pudding for dinner again!
... What is the matter with Mary Jane?'''
>>> print(summarize(text))
And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!

"""

import logging
from gensim.utils import deprecated
from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.textcleaner import clean_text_by_sentences as _clean_text_by_sentences
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
from gensim.corpora import Dictionary
from math import log10 as _log10
from six.moves import xrange


INPUT_MIN_LENGTH = 10

WEIGHT_THRESHOLD = 1.e-3

logger = logging.getLogger(__name__)


def _set_graph_edge_weights(graph):
    """Sets edge weights using the BM25 algorithm, in place. Edges whose weight falls below
    :const:`~gensim.summarization.summarizer.WEIGHT_THRESHOLD` are skipped (left at zero).
    If all edge weights end up zero, every weight is forced to 1.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    """
    documents = graph.nodes()
    weights = _bm25_weights(documents)

    for i in xrange(len(documents)):
        for j in xrange(len(documents)):
            # Skips self-loops and edges with negligible BM25 weight.
            if i == j or weights[i][j] < WEIGHT_THRESHOLD:
                continue

            sentence_1 = documents[i]
            sentence_2 = documents[j]

            edge_1 = (sentence_1, sentence_2)
            edge_2 = (sentence_2, sentence_1)

            if not graph.has_edge(edge_1):
                graph.add_edge(edge_1, weights[i][j])
            if not graph.has_edge(edge_2):
                graph.add_edge(edge_2, weights[j][i])

    # Handles the case in which all similarities are zero.
    # The resultant summary will consist of random sentences.
    if all(graph.edge_weight(edge) == 0 for edge in graph.edges()):
        _create_valid_graph(graph)


def _create_valid_graph(graph):
    """Sets the weight of every edge between two distinct nodes to 1, in place.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.

    """
    nodes = graph.nodes()

    for i in xrange(len(nodes)):
        for j in xrange(len(nodes)):
            if i == j:
                continue

            edge = (nodes[i], nodes[j])

            if graph.has_edge(edge):
                graph.del_edge(edge)

            graph.add_edge(edge, 1)


@deprecated("Function will be removed in 4.0.0")
def _get_doc_length(doc):
    """Get length of (tokenized) document.

    Parameters
    ----------
    doc : list of (list of (tuple of int))
        Given document.

    Returns
    -------
    int
        Length of document.
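
    Example
    -------
    A minimal illustration with a hand-built bag-of-words document
    (hypothetical term ids and counts; the length is the sum of the counts):

    >>> _get_doc_length([(0, 2), (3, 1), (7, 3)])
    6
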
"""
|
||
return sum([item[1] for item in doc])
|
||
|
||
|
||
@deprecated("Function will be removed in 4.0.0")
def _get_similarity(doc1, doc2, vec1, vec2):
    """Returns similarity of two documents.

    Parameters
    ----------
    doc1 : list of (list of (tuple of int))
        First document.
    doc2 : list of (list of (tuple of int))
        Second document.
    vec1 : array
        Sparse vector of the first document.
    vec2 : array
        Sparse vector of the second document.

    Returns
    -------
    float
        Similarity of the two documents.

    """
    numerator = vec1.dot(vec2.transpose()).toarray()[0][0]
    length_1 = _get_doc_length(doc1)
    length_2 = _get_doc_length(doc2)

    denominator = _log10(length_1) + _log10(length_2) if length_1 > 0 and length_2 > 0 else 0

    return numerator / denominator if denominator != 0 else 0


def _build_corpus(sentences):
    """Construct corpus from provided sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.

    Returns
    -------
    list of list of (int, int)
        Corpus built from sentences.
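
    Example
    -------
    A minimal sketch; the ``SyntacticUnit`` instances are built by hand here
    (assuming a ``(text, token)`` constructor), while in the real pipeline
    they come from the text cleaner:

    >>> from gensim.summarization.syntactic_unit import SyntacticUnit
    >>> units = [SyntacticUnit("Cats chase mice.", "cat chase mice"),
    ...          SyntacticUnit("Dogs chase cats.", "dog chase cat")]
    >>> corpus = _build_corpus(units)  # one bag-of-words vector per sentence
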
"""
|
||
split_tokens = [sentence.token.split() for sentence in sentences]
|
||
dictionary = Dictionary(split_tokens)
|
||
return [dictionary.doc2bow(token) for token in split_tokens]
|
||
|
||
|
||
def _get_important_sentences(sentences, corpus, important_docs):
    """Get most important sentences.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    corpus : list of list of (int, int)
        Provided corpus.
    important_docs : list of list of (int, int)
        Most important documents of the corpus.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.

    """
    hashable_corpus = _build_hasheable_corpus(corpus)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    return [sentences_by_corpus[tuple(important_doc)] for important_doc in important_docs]


def _get_sentences_with_word_count(sentences, word_count):
    """Get a list of sentences whose total word count is as close as possible to the specified `word_count`.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    word_count : int or None
        Target number of returned words. If None, all of the most important sentences will be returned.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.
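
    Example
    -------
    A minimal sketch of the greedy selection; the ``SyntacticUnit`` instances
    are built by hand for illustration (only their ``text`` attribute is used
    here). With a budget of 2 words, adding the second sentence would overshoot
    more than stopping, so only the first is kept:

    >>> from gensim.summarization.syntactic_unit import SyntacticUnit
    >>> units = [SyntacticUnit("one two"), SyntacticUnit("three four five")]
    >>> [unit.text for unit in _get_sentences_with_word_count(units, 2)]
    ['one two']
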
"""
|
||
length = 0
|
||
selected_sentences = []
|
||
|
||
# Loops until the word count is reached.
|
||
for sentence in sentences:
|
||
words_in_sentence = len(sentence.text.split())
|
||
|
||
# Checks if the inclusion of the sentence gives a better approximation
|
||
# to the word parameter.
|
||
if abs(word_count - length - words_in_sentence) > abs(word_count - length):
|
||
return selected_sentences
|
||
|
||
selected_sentences.append(sentence)
|
||
length += words_in_sentence
|
||
|
||
return selected_sentences
|
||
|
||
|
||
def _extract_important_sentences(sentences, corpus, important_docs, word_count):
    """Get most important sentences of the `corpus`.

    Parameters
    ----------
    sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    corpus : list of list of (int, int)
        Provided corpus.
    important_docs : list of list of (int, int)
        Most important docs of the corpus.
    word_count : int or None
        Number of returned words. If None, all of the most important sentences will be returned.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Most important sentences.

    """
    important_sentences = _get_important_sentences(sentences, corpus, important_docs)

    # If no "word_count" option is provided, the number of sentences is
    # reduced by the provided ratio. Otherwise, the ratio is ignored.
    return important_sentences \
        if word_count is None \
        else _get_sentences_with_word_count(important_sentences, word_count)


def _format_results(extracted_sentences, split):
    """Returns `extracted_sentences` in the desired format.

    Parameters
    ----------
    extracted_sentences : list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Given sentences.
    split : bool
        If True, sentences will be returned as a list. Otherwise, sentences will be merged and returned as a string.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Formatted result.
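
    Example
    -------
    A minimal sketch; the ``SyntacticUnit`` instances are built by hand for
    illustration (only their ``text`` attribute is used here):

    >>> from gensim.summarization.syntactic_unit import SyntacticUnit
    >>> units = [SyntacticUnit("First sentence."), SyntacticUnit("Second sentence.")]
    >>> _format_results(units, split=True)
    ['First sentence.', 'Second sentence.']
    >>> print(_format_results(units, split=False))
    First sentence.
    Second sentence.
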
"""
|
||
if split:
|
||
return [sentence.text for sentence in extracted_sentences]
|
||
return "\n".join([sentence.text for sentence in extracted_sentences])
|
||
|
||
|
||
def _build_hasheable_corpus(corpus):
    """Builds a hashable representation of `corpus`, converting each document to a tuple.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.

    Returns
    -------
    list of tuple of (int, int)
        Hashable corpus.
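
    Example
    -------
    A small bag-of-words corpus (hypothetical term ids and counts) mapped to
    tuples, so each document can serve as a dictionary key or graph node:

    >>> _build_hasheable_corpus([[(0, 1), (1, 2)], [(1, 1)]])
    [((0, 1), (1, 2)), ((1, 1),)]
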
"""
|
||
return [tuple(doc) for doc in corpus]
|
||
|
||
|
||
def summarize_corpus(corpus, ratio=0.2):
    """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.
    Used as a helper for :func:`~gensim.summarization.summarizer.summarize`.

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense.


    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        documents of the original corpus to be chosen for the summary.

    Returns
    -------
    list of list of (int, int)
        Most important documents of the given `corpus`, sorted by document score, highest first.
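
    Example
    -------
    A sketch of typical use, building a bag-of-words corpus with
    :class:`~gensim.corpora.Dictionary` (`raw_documents` below stands in for a
    hypothetical list of strings; the ranking depends on the computed scores):

    >>> from gensim.corpora import Dictionary
    >>> texts = [doc.split() for doc in raw_documents]
    >>> dictionary = Dictionary(texts)
    >>> corpus = [dictionary.doc2bow(tokens) for tokens in texts]
    >>> most_important_docs = summarize_corpus(corpus, ratio=0.2)
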
"""
|
||
hashable_corpus = _build_hasheable_corpus(corpus)
|
||
|
||
# If the corpus is empty, the function ends.
|
||
if len(corpus) == 0:
|
||
logger.warning("Input corpus is empty.")
|
||
return []
|
||
|
||
# Warns the user if there are too few documents.
|
||
if len(corpus) < INPUT_MIN_LENGTH:
|
||
logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)
|
||
|
||
graph = _build_graph(hashable_corpus)
|
||
_set_graph_edge_weights(graph)
|
||
_remove_unreachable_nodes(graph)
|
||
|
||
# Cannot calculate eigenvectors if number of unique documents in corpus < 3.
|
||
# Warns user to add more text. The function ends.
|
||
if len(graph.nodes()) < 3:
|
||
logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
|
||
return []
|
||
|
||
pagerank_scores = _pagerank(graph)
|
||
|
||
hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)
|
||
|
||
return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
|
||
|
||
|
||
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Get a summarized version of the given text.

    The output summary will consist of the most representative sentences
    and will be returned as a string, divided by newlines.

    Note
    ----
    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
    sentences for the summary to make sense.
    The text will be split into sentences using the split_sentences method in the
    :mod:`gensim.summarization.textcleaner` module. Note that newlines divide sentences.


    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words the output will contain.
        If both parameters are provided, the ratio will be ignored.
    split : bool, optional
        If True, a list of sentences will be returned. Otherwise, a joined
        string will be returned.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of the given text.
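
    Example
    -------
    Typical calls, assuming `text` is a string with at least
    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences
    (see the module docstring for a complete, runnable example):

    >>> summary = summarize(text)                 # keep ~20% of the sentences
    >>> summary = summarize(text, word_count=60)  # keep about 60 words
    >>> parts = summarize(text, split=True)       # return a list of sentences
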
"""
|
||
# Gets a list of processed sentences.
|
||
sentences = _clean_text_by_sentences(text)
|
||
|
||
# If no sentence could be identified, the function ends.
|
||
if len(sentences) == 0:
|
||
logger.warning("Input text is empty.")
|
||
return [] if split else u""
|
||
|
||
# If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
|
||
if len(sentences) == 1:
|
||
raise ValueError("input must have more than one sentence")
|
||
|
||
# Warns if the text is too short.
|
||
if len(sentences) < INPUT_MIN_LENGTH:
|
||
logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)
|
||
|
||
corpus = _build_corpus(sentences)
|
||
|
||
most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)
|
||
|
||
# If couldn't get important docs, the algorithm ends.
|
||
if not most_important_docs:
|
||
logger.warning("Couldn't get relevant sentences.")
|
||
return [] if split else u""
|
||
|
||
# Extracts the most important sentences with the selected criterion.
|
||
extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)
|
||
|
||
# Sorts the extracted sentences by apparition order in the original text.
|
||
extracted_sentences.sort(key=lambda s: s.index)
|
||
|
||
return _format_results(extracted_sentences, split)
|