54 lines
1.3 KiB
Python
54 lines
1.3 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
#
|
||
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
||
|
|
||
"""This module contains the implementation of the SyntacticUnit class. It is generally used during text cleaning.

:class:`~gensim.summarization.syntactic_unit.SyntacticUnit` represents a printable version of the provided text.

"""
|
||
|
|
||
|
|
||
|
class SyntacticUnit(object):
    """Container pairing an original piece of text with its processed form.

    Attributes
    ----------
    text : str
        Input text.
    token : str
        Tokenized text, or None when no token was supplied.
    tag : str
        First two characters of the tag passed to the constructor,
        or None when no (or an empty) tag was supplied.
    index : int
        Index of the syntactic unit in the corpus; initialised to -1.
    score : float
        Score of the syntactic unit; initialised to -1.

    """

    def __init__(self, text, token=None, tag=None):
        """

        Parameters
        ----------
        text : str
            Input text.
        token : str
            Tokenized text, optional.
        tag : str
            Tag of unit, optional. Only its first two characters are kept.

        """
        self.text = text
        self.token = token
        self.tag = tag[:2] if tag else None  # Just first two letters of tag
        # -1 is the initial value for both fields; nothing in this class updates them.
        self.index = -1
        self.score = -1

    def __str__(self):
        """Return a printable description of the original and processed unit.

        Returns
        -------
        str
            Text of the form
            ``Original unit: '<text>' *-*-*-* Processed unit: '<token>'``.

        """
        # %-formatting instead of `+` concatenation: `token` defaults to None,
        # and concatenating None to a str raises TypeError.
        return "Original unit: '%s' *-*-*-* Processed unit: '%s'" % (self.text, self.token)

    def __repr__(self):
        """Return the same representation as :meth:`__str__`."""
        return str(self)
|