#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains functions and processors used for processing text,
|
||
|
extracting sentences from text, working with acronyms and abbreviations.
|
||
|
|
||
|
Data
|
||
|
----
|
||
|
|
||
|
.. data:: SEPARATOR - Special separator used in abbreviations.
|
||
|
.. data:: RE_SENTENCE - Pattern to split text to sentences.
|
||
|
.. data:: AB_SENIOR - Pattern for detecting abbreviations (example: Sgt. Pepper).
|
||
|
.. data:: AB_ACRONYM - Pattern for detecting acronyms.
|
||
|
.. data:: AB_ACRONYM_LETTERS - Pattern for detecting acronyms (example: P.S. I love you).
|
||
|
.. data:: UNDO_AB_SENIOR - Pattern like AB_SENIOR but with SEPARATOR between abbreviation and next word.
|
||
|
.. data:: UNDO_AB_ACRONYM - Pattern like AB_ACRONYM but with SEPARATOR between abbreviation and next word.
|
||
|
|
||
|
"""
|
||
|
|
||
|
|
||
|
from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize
from six.moves import xrange
import re
import logging

logger = logging.getLogger('summarizer.preprocessing.cleaner')

try:
    from pattern.en import tag
    logger.info("'pattern' package found; tag filters are available for English")
    HAS_PATTERN = True
except ImportError:
    logger.info("'pattern' package not found; tag filters are not available for English")
    HAS_PATTERN = False


SEPARATOR = r'@'
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)
UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + SEPARATOR + r'(\w)', re.UNICODE)
UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + SEPARATOR + r'(\w)', re.UNICODE)


def split_sentences(text):
    """Split the given text into a list of sentences, preserving abbreviations matched by
    :const:`~gensim.summarization.textcleaner.AB_SENIOR` and :const:`~gensim.summarization.textcleaner.AB_ACRONYM`.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    list of str
        Sentences of the given text.

    Example
    -------
    >>> from gensim.summarization.textcleaner import split_sentences
    >>> text = '''Beautiful is better than ugly.
    ... Explicit is better than implicit. Simple is better than complex.'''
    >>> split_sentences(text)
    ['Beautiful is better than ugly.',
    'Explicit is better than implicit.',
    'Simple is better than complex.']

    """
    processed = replace_abbreviations(text)
    return [undo_replacement(sentence) for sentence in get_sentences(processed)]


def replace_abbreviations(text):
    """Replace the blank space after each abbreviation with the '@' separator, joining it to the next word.

    Parameters
    ----------
    text : str
        Input sentence.

    Returns
    -------
    str
        Sentence with changed separator.

    Example
    -------
    >>> replace_abbreviations("God bless you, please, Mrs. Robinson")
    'God bless you, please, Mrs.@Robinson'

    """
    return replace_with_separator(text, SEPARATOR, [AB_SENIOR, AB_ACRONYM])


def undo_replacement(sentence):
    """Replace the `@` separator after each abbreviation back with a blank space.

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    str
        Sentence with changed separator.

    Example
    -------
    >>> undo_replacement("God bless you, please, Mrs.@Robinson")
    'God bless you, please, Mrs. Robinson'

    """
    return replace_with_separator(sentence, r" ", [UNDO_AB_SENIOR, UNDO_AB_ACRONYM])


def replace_with_separator(text, separator, regexs):
    """Replace the gap between the two groups captured by each regex in `regexs` with `separator`.

    Parameters
    ----------
    text : str
        Input text.
    separator : str
        The separator to insert between the matched words.
    regexs : list of `_sre.SRE_Pattern`
        Regular expressions used in processing text.

    Returns
    -------
    str
        Text with replaced separators.
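
    Example
    -------
    An illustrative sketch using the module's own :const:`~gensim.summarization.textcleaner.AB_SENIOR`
    pattern; the output follows from its two capture groups.

    >>> from gensim.summarization.textcleaner import replace_with_separator, AB_SENIOR
    >>> replace_with_separator("Mrs. Robinson bless you", "@", [AB_SENIOR])
    'Mrs.@Robinson bless you'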

    """
    replacement = r"\1" + separator + r"\2"
    result = text
    for regex in regexs:
        result = regex.sub(replacement, result)
    return result


def get_sentences(text):
    """Generate sentences from the provided text, using the sentence pattern set
    in :const:`~gensim.summarization.textcleaner.RE_SENTENCE`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Single sentence extracted from text.

    Example
    -------
    >>> text = "Does this text contain two sentences? Yes, it does."
    >>> for sentence in get_sentences(text):
    ...     print(sentence)
    Does this text contain two sentences?
    Yes, it does.

    """
    for match in RE_SENTENCE.finditer(text):
        yield match.group()


def merge_syntactic_units(original_units, filtered_units, tags=None):
    """Process the given sentences and their filtered (tokenized) copies into
    :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` instances. Tags, if provided,
    are attached to the produced units.

    Parameters
    ----------
    original_units : list
        List of original sentences.
    filtered_units : list
        List of tokenized sentences.
    tags : list of str, optional
        List of strings used as tags for each unit. None by default.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        List of syntactic units (sentences).
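
    Example
    -------
    A minimal sketch: the token string below is a hand-written stand-in for real
    preprocessing output, and the produced units are assumed to expose `text` and
    `token` attributes as set by the :class:`SyntacticUnit` constructor.

    >>> from gensim.summarization.textcleaner import merge_syntactic_units
    >>> units = merge_syntactic_units(['Beautiful is better than ugly.', ''], ['beauti better ugli', ''])
    >>> len(units)  # units with an empty filtered copy are skipped
    1
    >>> units[0].text
    'Beautiful is better than ugly.'
    >>> units[0].token
    'beauti better ugli'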

    """
    units = []
    for i in xrange(len(original_units)):
        if filtered_units[i] == '':
            continue

        text = original_units[i]
        token = filtered_units[i]
        tag = tags[i][1] if tags else None
        sentence = SyntacticUnit(text, token, tag)
        sentence.index = i

        units.append(sentence)

    return units


def join_words(words, separator=" "):
|
||
|
"""Concatenates `words` with `separator` between elements.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
words : list of str
|
||
|
Given words.
|
||
|
separator : str, optional
|
||
|
The separator between elements.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
str
|
||
|
String of merged words with separator between elements.
|
||
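
    Example
    -------
    Both the default separator and an explicit empty one:

    >>> join_words(['veni', 'vidi', 'vici'])
    'veni vidi vici'
    >>> join_words(['veni', 'vidi', 'vici'], separator='')
    'venividivici'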

    """
    return separator.join(words)


def clean_text_by_sentences(text):
    """Tokenize a given text into sentences, applying filters and lemmatizing them.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Sentences of the given text.
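
    Example
    -------
    A rough sketch; the exact `token` attribute of each unit depends on gensim's
    default preprocessing filters, so only the original sentences are shown here.

    >>> from gensim.summarization.textcleaner import clean_text_by_sentences
    >>> units = clean_text_by_sentences("Beautiful is better than ugly. Simple is better than complex.")
    >>> [unit.text for unit in units]
    ['Beautiful is better than ugly.', 'Simple is better than complex.']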

    """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)


def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatizing them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}


def tokenize_by_word(text):
    """Tokenize the input text. Before tokenizing, the text is lowercased, accentuation is removed,
    and the periods inside acronyms matched by :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`
    are stripped.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    generator
        Generator that yields the words of the given text in sequence.

    Example
    -------
    >>> from gensim.summarization.textcleaner import tokenize_by_word
    >>> g = tokenize_by_word('Veni. Vidi. Vici.')
    >>> print(next(g))
    veni
    >>> print(next(g))
    vidi
    >>> print(next(g))
    vici

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)