#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains functions to find keywords of a text and to build a graph on the tokens of the text.

Examples
--------
Extract keywords from text

>>> from gensim.summarization import keywords
>>> text='''Challenges in natural language processing frequently involve
... speech recognition, natural language understanding, natural language
... generation (frequently from formal, machine-readable logical forms),
... connecting language and machine perception, dialog systems, or some
... combination thereof.'''
>>> keywords(text).split('\\n')
[u'natural language', u'machine', u'frequently']


Notes
-----
Check the tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only the first two letters
for `INCLUDING_FILTER` and `EXCLUDING_FILTER`.

Data:
-----
.. data:: WINDOW_SIZE - Size of window, number of consecutive tokens in processing.
.. data:: INCLUDING_FILTER - Including part of speech filters.
.. data:: EXCLUDING_FILTER - Excluding part of speech filters.

"""

from gensim.summarization.pagerank_weighted import pagerank_weighted as _pagerank
from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word
from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word
from gensim.summarization.commons import build_graph as _build_graph
from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
from gensim.utils import to_unicode
from itertools import combinations as _combinations
from six.moves.queue import Queue as _Queue
from six.moves import xrange
from six import iteritems


WINDOW_SIZE = 2

INCLUDING_FILTER = ['NN', 'JJ']
EXCLUDING_FILTER = []


def _get_pos_filters():
    """Get default including and excluding filters as frozen sets.

    Returns
    -------
    (frozenset of str, frozenset of str)
        Including and excluding filters.

    """
    return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)


def _get_words_for_graph(tokens, pos_filter=None):
    """Filters given dictionary of tokens using provided part of speech filters.

    Parameters
    ----------
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    pos_filter : iterable, optional
        Part of speech filters. If `None`, the default filters from :func:`_get_pos_filters` are used.

    Returns
    -------
    list of str
        Filtered tokens.

    Raises
    ------
    ValueError
        If the include and exclude filters are both non-empty at the same time.

    """
    if pos_filter is None:
        include_filters, exclude_filters = _get_pos_filters()
    else:
        include_filters = set(pos_filter)
        exclude_filters = frozenset([])
    if include_filters and exclude_filters:
        raise ValueError("Can't use both include and exclude filters, should use only one")

    result = []
    for word, unit in iteritems(tokens):
        if exclude_filters and unit.tag in exclude_filters:
            continue
        if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
            result.append(unit.token)
    return result
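
# Illustrative sketch for `_get_words_for_graph` (not part of the original module).
# The `tokens` values only need `.token` and `.tag` attributes; here a hypothetical
# namedtuple stands in for the units normally produced by `_clean_text_by_word`:
#
#     >>> from collections import namedtuple
#     >>> Unit = namedtuple('Unit', 'token tag')
#     >>> toks = {'cats': Unit('cat', 'NN'), 'ran': Unit('run', 'VB'), 'fast': Unit('fast', 'JJ')}
#     >>> sorted(_get_words_for_graph(toks))  # default filter keeps 'NN' and 'JJ' tags
#     ['cat', 'fast']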


def _get_first_window(split_text):
    """Get first :const:`~gensim.summarization.keywords.WINDOW_SIZE` tokens from given `split_text`.

    Parameters
    ----------
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    list of str
        First :const:`~gensim.summarization.keywords.WINDOW_SIZE` tokens.

    """
    return split_text[:WINDOW_SIZE]
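
# For the default WINDOW_SIZE of 2 this is simply the first two words, e.g.
# _get_first_window(['natural', 'language', 'processing']) -> ['natural', 'language'].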


def _set_graph_edge(graph, tokens, word_a, word_b):
    """Sets an edge between nodes named word_a and word_b if they exist in `tokens` and `graph`, inplace.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    word_a : str
        First word, name of the first node.
    word_b : str
        Second word, name of the second node.

    """
    if word_a in tokens and word_b in tokens:
        lemma_a = tokens[word_a].token
        lemma_b = tokens[word_b].token
        edge = (lemma_a, lemma_b)

        if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
            graph.add_edge(edge)


def _process_first_window(graph, tokens, split_text):
    """Sets edges between nodes taken from the first :const:`~gensim.summarization.keywords.WINDOW_SIZE`
    words of `split_text` if they exist in `tokens` and `graph`, inplace.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    first_window = _get_first_window(split_text)
    for word_a, word_b in _combinations(first_window, 2):
        _set_graph_edge(graph, tokens, word_a, word_b)


def _init_queue(split_text):
    """Initialize a queue with the words of the first window of `split_text` (all of them except the first one).

    Parameters
    ----------
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    Queue
        Initialized queue.

    """
    queue = _Queue()
    first_window = _get_first_window(split_text)
    for word in first_window[1:]:
        queue.put(word)
    return queue
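
# The queue is the sliding window's memory: right after initialization it holds
# WINDOW_SIZE - 1 words (the first window minus its leading word), and
# `_update_queue` keeps that size invariant while the window moves forward.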


def _process_word(graph, tokens, queue, word):
    """Sets an edge between `word` and each element in the queue in `graph`, if such nodes
    exist in `tokens` and `graph`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    queue : Queue
        Given queue.
    word : str
        Word, possible `node` in graph and item in `tokens`.

    """
    for word_to_compare in _queue_iterator(queue):
        _set_graph_edge(graph, tokens, word, word_to_compare)


def _update_queue(queue, word):
    """Updates given `queue` (removes the oldest item and puts `word`).

    Parameters
    ----------
    queue : Queue
        Given queue.
    word : str
        Word to be added to the queue.

    """
    queue.get()
    queue.put(word)
    assert queue.qsize() == (WINDOW_SIZE - 1)


def _process_text(graph, tokens, split_text):
    """Process `split_text` by updating the given `graph` with new edges between nodes
    if they exist in `tokens` and `graph`.
    Words are taken from `split_text` with window size :const:`~gensim.summarization.keywords.WINDOW_SIZE`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    queue = _init_queue(split_text)
    for i in xrange(WINDOW_SIZE, len(split_text)):
        word = split_text[i]
        _process_word(graph, tokens, queue, word)
        _update_queue(queue, word)
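
# Worked example of the sliding window (illustrative, not part of the original module):
# with WINDOW_SIZE = 2 and split_text = ['a', 'b', 'c', 'd'], the first window
# contributes the edge (a, b); the loop then slides one word at a time, adding
# (b, c) and (c, d). In general, any two words that co-occur within WINDOW_SIZE
# consecutive tokens (and correspond to nodes of the graph) end up connected.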


def _queue_iterator(queue):
    """Iterate over the given queue, yielding each item and putting it back, so that the
    queue's contents and order are preserved after a full pass.

    Parameters
    ----------
    queue : Queue
        Given queue.

    Yields
    ------
    str
        Current item of the queue.

    """
    iterations = queue.qsize()
    for _ in xrange(iterations):
        var = queue.get()
        yield var
        queue.put(var)
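
# Note: this generator "rotates" the queue rather than consuming it. Each item is
# popped, yielded and then put back, so after a complete pass the queue holds the
# same WINDOW_SIZE - 1 words in their original order.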


def _set_graph_edges(graph, tokens, split_text):
    """Updates given `graph` by setting edges between nodes if they exist in `tokens` and `graph`.
    Words are taken from `split_text` with window size :const:`~gensim.summarization.keywords.WINDOW_SIZE`.

    Parameters
    ----------
    graph : :class:`~gensim.summarization.graph.Graph`
        Given graph.
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.
    split_text : list of str
        Given text, split into words.

    """
    _process_first_window(graph, tokens, split_text)
    _process_text(graph, tokens, split_text)


def _extract_tokens(lemmas, scores, ratio, words):
    """Extracts tokens from provided lemmas. The highest-scored lemmas are used if `words` is not provided.

    Parameters
    ----------
    lemmas : list of str
        Given lemmas.
    scores : dict
        Dictionary with lemmas and their scores.
    ratio : float
        Proportion of lemmas used for the final result.
    words : int
        Number of lemmas to return. If `None`, the number of returned lemmas is
        determined by `ratio`; otherwise `ratio` is ignored.

    Returns
    -------
    list of (float, str)
        Scores and corresponding lemmas.

    """
    lemmas.sort(key=lambda s: scores[s], reverse=True)
    length = len(lemmas) * ratio if words is None else words
    return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]
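
# Illustrative arithmetic (not part of the original module): with 50 ranked lemmas
# and the default ratio of 0.2, int(50 * 0.2) = 10 top-scoring (score, lemma) pairs
# are returned; passing e.g. words=3 overrides the ratio and returns exactly 3 pairs.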


def _lemmas_to_words(tokens):
    """Get words and lemmas from given tokens. Produces a "reversed" `tokens` mapping.

    Parameters
    ----------
    tokens : dict
        Original units (words) as keys and processed units (tokens) as values.

    Returns
    -------
    dict
        Lemmas as keys and lists of corresponding words as values.

    """
    lemma_to_word = {}
    for word, unit in iteritems(tokens):
        lemma = unit.token
        if lemma in lemma_to_word:
            lemma_to_word[lemma].append(word)
        else:
            lemma_to_word[lemma] = [word]
    return lemma_to_word
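
# Illustrative sketch (not part of the original module): if both "run" and "running"
# were reduced to the same processed form "run", the reversed mapping would contain
# an entry like {'run': ['run', 'running']}, so every surface form of a highly
# ranked lemma can later be reported as a keyword.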


def _get_keywords_with_score(extracted_lemmas, lemma_to_word):
    """Get the words corresponding to `extracted_lemmas` and their scores; the words are looked up in `lemma_to_word`.

    Parameters
    ----------
    extracted_lemmas : list of (float, str)
        Given lemmas with scores.
    lemma_to_word : dict
        Lemmas and corresponding words.

    Returns
    -------
    dict
        Keywords as keys and their scores as values.

    """
    keywords = {}
    for score, lemma in extracted_lemmas:
        keyword_list = lemma_to_word[lemma]
        for keyword in keyword_list:
            keywords[keyword] = score
    return keywords


def _strip_word(word):
    """Get cleaned `word`.

    Parameters
    ----------
    word : str
        Given word.

    Returns
    -------
    str
        Cleaned word.

    """
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""


def _get_combined_keywords(_keywords, split_text):
    """Get the most scored words (`_keywords`) contained in `split_text`, and their combinations.

    Parameters
    ----------
    _keywords : dict
        Keywords as keys and their scores as values.
    split_text : list of str
        Given text, split into words.

    Returns
    -------
    list of str
        Keywords and/or their combinations.

    """
    result = []
    _keywords = _keywords.copy()
    len_text = len(split_text)
    for i in xrange(len_text):
        word = _strip_word(split_text[i])
        if word in _keywords:
            combined_word = [word]
            if i + 1 == len_text:
                result.append(word)  # appends last word if keyword and doesn't iterate
            for j in xrange(i + 1, len_text):
                other_word = _strip_word(split_text[j])
                if other_word in _keywords and other_word == split_text[j] and other_word not in combined_word:
                    combined_word.append(other_word)
                else:
                    for keyword in combined_word:
                        _keywords.pop(keyword)
                    result.append(" ".join(combined_word))
                    break
    return result
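
# Illustrative sketch (not part of the original module): if both "natural" and
# "language" are high-scoring keywords and appear next to each other in the text,
# they are merged into the single combined keyword "natural language". A keyword
# followed by a non-keyword, or by a token carrying punctuation (the raw
# `split_text` entry must equal its stripped form), is emitted on its own.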


def _get_average_score(concept, _keywords):
    """Get the average score of the words in `concept`.

    Parameters
    ----------
    concept : str
        Input text (a keyword or a combination of keywords).
    _keywords : dict
        Keywords as keys and their scores as values.

    Returns
    -------
    float
        Average score.

    """
    word_list = concept.split()
    word_counter = 0
    total = 0
    for word in word_list:
        total += _keywords[word]
        word_counter += 1
    return total / word_counter
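
# Illustrative arithmetic (not part of the original module): for a combined keyword
# "natural language" with _keywords = {'natural': 0.30, 'language': 0.20}, the
# average score is (0.30 + 0.20) / 2 = 0.25.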


def _format_results(_keywords, combined_keywords, split, scores):
    """Formats, sorts and returns `combined_keywords` in the desired format.

    Parameters
    ----------
    _keywords : dict
        Keywords as keys and their scores as values.
    combined_keywords : list of str
        Most ranked words and/or their combinations.
    split : bool
        Whether to return the result as a list (`True`) or as a single joined string, optional.
    scores : bool
        Whether to return `combined_keywords` with scores, optional. If True,
        `split` is ignored.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by a newline.

    """
    combined_keywords.sort(key=lambda w: _get_average_score(w, _keywords), reverse=True)
    if scores:
        return [(word, _get_average_score(word, _keywords)) for word in combined_keywords]
    if split:
        return combined_keywords
    return "\n".join(combined_keywords)
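
# Depending on the flags, the same ranking is returned in one of three shapes
# (values below are illustrative, not part of the original module):
#   scores=True  ->  [('natural language', 0.25), ('machine', 0.16), ...]
#   split=True   ->  ['natural language', 'machine', ...]
#   default      ->  'natural language\nmachine\n...'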


def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get the most ranked words of the provided text and/or their combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned keywords is determined by
        `ratio` (a fraction of the text's lemmas); otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether to return the keywords as a list (`True`) or as a single newline-joined string.
    scores : bool, optional
        Whether to return the score along with each keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by a newline.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
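
# Illustrative usage of the extra flags (not part of the original module); the exact
# output depends on the input text and the PageRank scores, so no values are asserted:
#
#     >>> keywords(text, ratio=0.5, split=True)   # more candidates, as a list  # doctest: +SKIP
#     >>> keywords(text, scores=True)             # [(keyword, score), ...] pairs  # doctest: +SKIP
#     >>> keywords(text, pos_filter=('NN',), lemmatize=True)  # nouns only, one form per lemma  # doctest: +SKIP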


def get_graph(text):
    """Creates and returns a graph from the given text; the text is cleaned and tokenized before the graph is built.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
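

# Minimal, self-contained demo (illustrative, not part of the original module);
# it only runs when this file is executed directly, not on import:
if __name__ == '__main__':
    sample = (
        "Challenges in natural language processing frequently involve "
        "speech recognition, natural language understanding, natural language "
        "generation, connecting language and machine perception, dialog systems, "
        "or some combination thereof."
    )
    # Plain ranked keywords, one per line.
    print(keywords(sample))
    # The same extraction as (keyword, score) pairs.
    print(keywords(sample, scores=True))
    # A look at the nodes of the underlying token co-occurrence graph.
    print(get_graph(sample).nodes())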