#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains functions to find keywords of a text and to build a graph on the tokens of the text.

Examples
--------
Extract keywords from text

>>> from gensim.summarization import keywords
>>> text='''Challenges in natural language processing frequently involve
... speech recognition, natural language understanding, natural language
... generation (frequently from formal, machine-readable logical forms),
... connecting language and machine perception, dialog systems, or some
... combination thereof.'''
>>> keywords(text).split('\\n')
[u'natural language', u'machine', u'frequently']


Notes
-----
Check the tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only the first two letters
for `INCLUDING_FILTER` and `EXCLUDING_FILTER`.

Data:
-----
.. data:: WINDOW_SIZE - Size of window, number of consecutive tokens in processing.
.. data:: INCLUDING_FILTER - Including part of speech filters.
.. data:: EXCLUDING_FILTER - Excluding part of speech filters.

"""

from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word
from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.utils import to_unicode
from itertools import combinations as _combinations
from six.moves.queue import Queue as _Queue
from six.moves import xrange
from six import iteritems


WINDOW_SIZE = 2

INCLUDING_FILTER = ['NN', 'JJ']
EXCLUDING_FILTER = []


def _get_pos_filters():
    """Get default including and excluding filters as frozen sets.

    Returns
    -------
    (frozenset of str, frozenset of str)
        Including and excluding filters.

    """
    return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)


def _get_words_for_graph(tokens, pos_filter=None):
    """Filters given dictionary of tokens using provided part of speech filters.

    Parameters
    ----------
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    pos_filter : iterable, optional
        Part of speech filters. If `None`, the default filters from :func:`_get_pos_filters` are used.

    Returns
    -------
    list of str
        Filtered tokens.

    Raises
    ------
    ValueError
        If the include and exclude filters are both non-empty at the same time.

    """
    if pos_filter is None:
        include_filters, exclude_filters = _get_pos_filters()
    else:
        include_filters = set(pos_filter)
        exclude_filters = frozenset([])
    if include_filters and exclude_filters:
        raise ValueError("Can't use both include and exclude filters, should use only one")

    result = []
    for word, unit in iteritems(tokens):
        if exclude_filters and unit.tag in exclude_filters:
            continue
        if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
            result.append(unit.token)
    return result
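
# Illustrative sketch for `_get_words_for_graph` (not part of the original module).
# The `tokens` values only need `.token` and `.tag` attributes; here a hypothetical
# namedtuple stands in for the units normally produced by `_clean_text_by_word`:
#
#     >>> from collections import namedtuple
#     >>> Unit = namedtuple('Unit', 'token tag')
#     >>> toks = {'cats': Unit('cat', 'NN'), 'ran': Unit('run', 'VB'), 'fast': Unit('fast', 'JJ')}
#     >>> sorted(_get_words_for_graph(toks))  # default filter keeps 'NN' and 'JJ' tags
#     ['cat', 'fast']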


def _get_first_window(split_text):
    """Get first :const:`~gensim.summarization.keywords.WINDOW_SIZE` tokens from given `split_text`.

    Parameters
    ----------
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    list of str
        First :const:`~gensim.summarization.keywords.WINDOW_SIZE` tokens.

    """
    return split_text[:WINDOW_SIZE]
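
# For the default WINDOW_SIZE of 2 this is simply the first two words, e.g.
# _get_first_window(['natural', 'language', 'processing']) -> ['natural', 'language'].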


def _set_graph_edge(graph, tokens, word_a, word_b):
    """Sets an edge between nodes named word_a and word_b if they exist in `tokens` and `graph`, inplace.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    word_a : str
        First word, name of the first node.
    word_b : str
        Second word, name of the second node.

    """
    if word_a in tokens and word_b in tokens:
        lemma_a = tokens[word_a].token
        lemma_b = tokens[word_b].token
        edge = (lemma_a, lemma_b)

        if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
            graph.add_edge(edge)


def _process_first_window(graph, tokens, split_text):
    """Sets edges between nodes taken from the first :const:`~gensim.summarization.keywords.WINDOW_SIZE`
    words of `split_text` if they exist in `tokens` and `graph`, inplace.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    first_window = _get_first_window(split_text)
    for word_a, word_b in _combinations(first_window, 2):
        _set_graph_edge(graph, tokens, word_a, word_b)


def _init_queue(split_text):
    """Initialize a queue with the words of the first window of `split_text` (all of them except the first one).

    Parameters
    ----------
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    Queue
        Initialized queue.

    """
    queue = _Queue()
    first_window = _get_first_window(split_text)
    for word in first_window[1:]:
        queue.put(word)
    return queue
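
# The queue is the sliding window's memory: right after initialization it holds
# WINDOW_SIZE - 1 words (the first window minus its leading word), and
# `_update_queue` keeps that size invariant while the window moves forward.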


def _process_word(graph, tokens, queue, word):
    """Sets an edge between `word` and each element in the queue in `graph`, if such nodes
    exist in `tokens` and `graph`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    queue : Queue
        Given queue.
    word : str
        Word, possible `node` in graph and item in `tokens`.

    """
    for word_to_compare in _queue_iterator(queue):
        _set_graph_edge(graph, tokens, word, word_to_compare)


def _update_queue(queue, word):
    """Updates given `queue` (removes the oldest item and puts `word`).

    Parameters
    ----------
    queue : Queue
        Given queue.
    word : str
        Word to be added to the queue.

    """
    queue.get()
    queue.put(word)
    assert queue.qsize() == (WINDOW_SIZE - 1)


def _process_text(graph, tokens, split_text):
    """Process `split_text` by updating the given `graph` with new edges between nodes
    if they exist in `tokens` and `graph`.
    Words are taken from `split_text` with window size :const:`~gensim.summarization.keywords.WINDOW_SIZE`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    queue = _init_queue(split_text)
    for i in xrange(WINDOW_SIZE, len(split_text)):
        word = split_text[i]
        _process_word(graph, tokens, queue, word)
        _update_queue(queue, word)
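
# Worked example of the sliding window (illustrative, not part of the original module):
# with WINDOW_SIZE = 2 and split_text = ['a', 'b', 'c', 'd'], the first window
# contributes the edge (a, b); the loop then slides one word at a time, adding
# (b, c) and (c, d). In general, any two words that co-occur within WINDOW_SIZE
# consecutive tokens (and correspond to nodes of the graph) end up connected.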


def _queue_iterator(queue):
    """Iterate over the given queue, yielding each item and putting it back, so that the
    queue's contents and order are preserved after a full pass.

    Parameters
    ----------
    queue : Queue
        Given queue.

    Yields
    ------
    str
        Current item of the queue.

    """
    iterations = queue.qsize()
    for _ in xrange(iterations):
        var = queue.get()
        yield var
        queue.put(var)
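
# Note: this generator "rotates" the queue rather than consuming it. Each item is
# popped, yielded and then put back, so after a complete pass the queue holds the
# same WINDOW_SIZE - 1 words in their original order.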


def _set_graph_edges(graph, tokens, split_text):
    """Updates given `graph` by setting edges between nodes if they exist in `tokens` and `graph`.
    Words are taken from `split_text` with window size :const:`~gensim.summarization.keywords.WINDOW_SIZE`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    _process_first_window(graph, tokens, split_text)
    _process_text(graph, tokens, split_text)


def _extract_tokens(lemmas, scores, ratio, words):
    """Extracts tokens from provided lemmas. The highest-scored lemmas are used if `words` is not provided.

    Parameters
    ----------
    lemmas : list of str
        Given lemmas.
    scores : dict
        Dictionary with lemmas and their scores.
    ratio : float
        Proportion of lemmas used for the final result.
    words : int
        Number of lemmas to return. If `None`, the number of returned lemmas is
        determined by `ratio`; otherwise `ratio` is ignored.

    Returns
    -------
    list of (float, str)
        Scores and corresponding lemmas.

    """
    lemmas.sort(key=lambda s: scores[s], reverse=True)
    length = len(lemmas) * ratio if words is None else words
    return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]
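
# Illustrative arithmetic (not part of the original module): with 50 ranked lemmas
# and the default ratio of 0.2, int(50 * 0.2) = 10 top-scoring (score, lemma) pairs
# are returned; passing e.g. words=3 overrides the ratio and returns exactly 3 pairs.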


def _lemmas_to_words(tokens):
    """Get words and lemmas from given tokens. Produces a "reversed" `tokens` mapping.

    Parameters
    ----------
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.

    Returns
    -------
    dict
        Lemmas as keys and lists of corresponding words as values.

    """
    lemma_to_word = {}
    for word, unit in iteritems(tokens):
        lemma = unit.token
        if lemma in lemma_to_word:
            lemma_to_word[lemma].append(word)
        else:
            lemma_to_word[lemma] = [word]
    return lemma_to_word
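
# Illustrative sketch (not part of the original module): if both "run" and "running"
# were reduced to the same processed form "run", the reversed mapping would contain
# an entry like {'run': ['run', 'running']}, so every surface form of a highly
# ranked lemma can later be reported as a keyword.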


def _get_keywords_with_score(extracted_lemmas, lemma_to_word):
    """Get the words corresponding to `extracted_lemmas` and their scores; the words are looked up in `lemma_to_word`.

    Parameters
    ----------
    extracted_lemmas : list of (float, str)
        Given lemmas with scores.
    lemma_to_word : dict
        Lemmas and corresponding words.

    Returns
    -------
    dict
        Keywords as keys and their scores as values.

    """
    keywords = {}
    for score, lemma in extracted_lemmas:
        keyword_list = lemma_to_word[lemma]
        for keyword in keyword_list:
            keywords[keyword] = score
    return keywords


def _strip_word(word):
    """Get cleaned `word`.

    Parameters
    ----------
    word : str
        Given word.

    Returns
    -------
    str
        Cleaned word.

    """
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""


def _get_combined_keywords(_keywords, split_text):
    """Get the most scored words (`_keywords`) contained in `split_text`, and their combinations.

    Parameters
    ----------
    _keywords : dict
        Keywords as keys and their scores as values.
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    list of str
        Keywords and/or their combinations.

    """
    result = []
    _keywords = _keywords.copy()
    len_text = len(split_text)
    for i in xrange(len_text):
        word = _strip_word(split_text[i])
        if word in _keywords:
            combined_word = [word]
            if i + 1 == len_text:
                result.append(word)  # appends last word if keyword and doesn't iterate
            for j in xrange(i + 1, len_text):
                other_word = _strip_word(split_text[j])
                if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word:
                    combined_word.append(other_word)
                else:
                    for keyword in combined_word:
                        _keywords.pop(keyword)
                    result.append(" ".join(combined_word))
                    break
    return result
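
# Illustrative sketch (not part of the original module): if both "natural" and
# "language" are high-scoring keywords and appear next to each other in the text,
# they are merged into the single combined keyword "natural language". A keyword
# followed by a non-keyword, or by a token carrying punctuation (the raw
# `split_text` entry must equal its stripped form), is emitted on its own.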


def _get_average_score(concept, _keywords):
    """Get the average score of the words in `concept`.

    Parameters
    ----------
    concept : str
        Input text (a keyword or a combination of keywords).
    _keywords : dict
        Keywords as keys and their scores as values.

    Returns
    -------
    float
        Average score.

    """
    word_list = concept.split()
    word_counter = 0
    total = 0
    for word in word_list:
        total += _keywords[word]
        word_counter += 1
    return total / word_counter
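
# Illustrative arithmetic (not part of the original module): for a combined keyword
# "natural language" with _keywords = {'natural': 0.30, 'language': 0.20}, the
# average score is (0.30 + 0.20) / 2 = 0.25.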


def _format_results(_keywords, combined_keywords, split, scores):
    """Formats, sorts and returns `combined_keywords` in the desired format.

    Parameters
    ----------
    _keywords : dict
        Keywords as keys and their scores as values.
    combined_keywords : list of str
        Most ranked words and/or their combinations.
    split : bool
        Whether to return the result as a list (`True`) or as a single joined string, optional.
    scores : bool
        Whether to return `combined_keywords` with scores, optional. If True,
        `split` is ignored.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by a newline.

    """
    combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True)
    if scores:
        return [(word, _get_average_score(word, _keywords)) for word in combined_keywords]
    if split:
        return combined_keywords
    return "\n".join(combined_keywords)
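
# Depending on the flags, the same ranking is returned in one of three shapes
# (values below are illustrative, not part of the original module):
#   scores=True  ->  [('natural language', 0.25), ('machine', 0.16), ...]
#   split=True   ->  ['natural language', 'machine', ...]
#   default      ->  'natural language\nmachine\n...'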


def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get the most ranked words of the provided text and/or their combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned keywords is determined by
        `ratio` (a fraction of the text's lemmas); otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether to return the keywords as a list (`True`) or as a single newline-joined string.
    scores : bool, optional
        Whether to return the score along with each keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by a newline.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
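
# Illustrative usage of the extra flags (not part of the original module); the exact
# output depends on the input text and the PageRank scores, so no values are asserted:
#
#     >>> keywords(text, ratio=0.5, split=True)   # more candidates, as a list  # doctest: +SKIP
#     >>> keywords(text, scores=True)             # [(keyword, score), ...] pairs  # doctest: +SKIP
#     >>> keywords(text, pos_filter=('NN',), lemmatize=True)  # nouns only, one form per lemma  # doctest: +SKIP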


def get_graph(text):
    """Creates and returns a graph from the given text; the text is cleaned and tokenized before the graph is built.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
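

# Minimal, self-contained demo (illustrative, not part of the original module);
# it only runs when this file is executed directly, not on import:
if __name__ == '__main__':
    sample = (
        "Challenges in natural language processing frequently involve "
        "speech recognition, natural language understanding, natural language "
        "generation, connecting language and machine perception, dialog systems, "
        "or some combination thereof."
    )
    # Plain ranked keywords, one per line.
    print(keywords(sample))
    # The same extraction as (keyword, score) pairs.
    print(keywords(sample, scores=True))
    # A look at the nodes of the underlying token co-occurrence graph.
    print(get_graph(sample).nodes())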