#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains functions and processors used for processing text,
extracting sentences from text, working with acronyms and abbreviations.
Data
----
.. data:: SEPARATOR - Special separator used in abbreviations.
.. data:: RE_SENTENCE - Pattern to split text to sentences.
.. data:: AB_SENIOR - Pattern for detecting abbreviations (example: Sgt. Pepper).
.. data:: AB_ACRONYM - Pattern for detecting acronyms.
.. data:: AB_ACRONYM_LETTERS - Pattern for detecting acronyms (example: P.S. I love you).
.. data:: UNDO_AB_SENIOR - Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word.
.. data:: UNDO_AB_ACRONYM - Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word.
"""

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging

logger = logging.getLogger('summarizer.preprocessing.cleaner')

try:
    from pattern.en import tag
    logger.info("'pattern' package found; tag filters are available for English")
    HAS_PATTERN = True
except ImportError:
    logger.info("'pattern' package not found; tag filters are not available for English")
    HAS_PATTERN = False

SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)
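
# For example, AB_SENIOR rewrites "Mrs. Robinson" as "Mrs.@Robinson" so that
# RE_SENTENCE does not treat the abbreviation's trailing period as a sentence
# boundary; UNDO_AB_SENIOR later restores the separator to a blank space.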


def split_sentences(text):
    """Split the given text into a list of sentences, preserving the abbreviations matched by
    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        Sentences of the given text.

    Example
    -------
    >>> from gensim.summarization.textcleaner import split_sentences
    >>> text = '''Beautiful is better than ugly.
    ... Explicit is better than implicit. Simple is better than complex.'''
    >>> split_sentences(text)
    ['Beautiful is better than ugly.',
    'Explicit is better than implicit.',
    'Simple is better than complex.']

    """
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


def replace_abbreviations(text):
    """Replace the blank space after each abbreviation with the '@' separator.

    Parameters
    ----------
    text : str
        Input sentence.

    Returns
    -------
    str
        Sentence with the separator inserted.

    Example
    -------
    >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
    God bless you, please, Mrs.@Robinson

    """
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    """Replace the `@` separator after each abbreviation back with a blank space.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    str
        Sentence with the separator restored to a space.

    Example
    -------
    >>> undo_replacement("God bless you, please, Mrs.@Robinson")
    God bless you, please, Mrs. Robinson

    """
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    """Return `text` with `separator` substituted wherever any of the provided regular expressions match.

    Parameters
    ----------
    text : str
        Input text.
    separator : str
        The separator to insert between the two matched groups.
    regexs : list of `_sre.SRE_Pattern`
        Regular expressions used in processing text.

    Returns
    -------
    str
        Text with replaced separators.
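
    Example
    -------
    A minimal sketch of the expected behavior, reusing the module-level
    :const:`~gensim.summarization.textcleaner.AB_SENIOR` pattern:

    >>> replace_with_separator("Mrs. Robinson", "@", [AB_SENIOR])
    Mrs.@Robinson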
"""
replacement = r"\1" + separator + r"\2"
result = text
for regex in regexs:
result = regex.sub(replacement, result)
return result


def get_sentences(text):
    """Sentence generator from provided text. The sentence pattern is set
    in :const:`~gensim.summarization.textcleaner.RE_SENTENCE`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Single sentence extracted from text.

    Example
    -------
    >>> text = "Does this text contain two sentences? Yes, it does."
    >>> for sentence in get_sentences(text):
    ...     print(sentence)
    Does this text contain two sentences?
    Yes, it does.

    """
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Process the given sentences and their filtered (tokenized) copies into
    :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`. Also adds tags to the produced units, if provided.

    Parameters
    ----------
    original_units : list
        List of original sentences.
    filtered_units : list
        List of tokenized sentences.
    tags : list of str, optional
        List of strings used as tags for each unit. None by default.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        List of syntactic units (sentences).
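
    Example
    -------
    A minimal sketch; the token string 'beauti better ugli' is a hypothetical
    filtered form, not the output of a specific preprocessing pipeline:

    >>> merge_syntactic_units(['Beautiful is better than ugly.'], ['beauti better ugli'])
    [Original unit: 'Beautiful is better than ugly.' *-*-*-* Processed unit: 'beauti better ugli']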
"""
units = []
for i in xrange(len(original_units)):
if filtered_units[i] == '':
continue
text = original_units[i]
token = filtered_units[i]
tag = tags[i][1] if tags else None
sentence = SyntacticUnit(text, token, tag)
sentence.index = i
units.append(sentence)
return units


def join_words(words, separator=" "):
    """Concatenate `words` with `separator` between elements.

    Parameters
    ----------
    words : list of str
        Given words.
    separator : str, optional
        The separator between elements.

    Returns
    -------
    str
        String of merged words with the separator between elements.
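
    Example
    -------
    >>> join_words(['veni', 'vidi', 'vici'], separator='-')
    veni-vidi-vici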
"""
return separator.join(words)


def clean_text_by_sentences(text):
    """Tokenize the given text into sentences, apply filters, and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Sentences of the given text.
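
    Example
    -------
    A sketch of expected usage; the exact processed tokens depend on gensim's
    preprocessing filters, so only the number of returned units is shown:

    >>> units = clean_text_by_sentences("Beautiful is better than ugly. Simple is better than complex.")
    >>> len(units)
    2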
"""
original_sentences = split_sentences(text)
filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]
return merge_syntactic_units(original_sentences, filtered_sentences)


def clean_text_by_word(text, deacc=True):
    """Tokenize the given text into words, apply filters, and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}


def tokenize_by_word(text):
    """Tokenize the input text. Before tokenizing, the text is lowercased, accentuation is removed,
    and acronyms matched by :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS` are collapsed.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    generator
        Generator that yields the words of the given text in sequence.

    Example
    -------
    >>> from gensim.summarization.textcleaner import tokenize_by_word
    >>> g = tokenize_by_word('Veni. Vedi. Vici.')
    >>> print(next(g))
    veni
    >>> print(next(g))
    vedi
    >>> print(next(g))
    vici

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)