# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see  LICENSE.TXT

from __future__ import division, print_function, unicode_literals
from abc import ABCMeta, abstractmethod
from six import add_metaclass


@add_metaclass(ABCMeta)
class Feature(object):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, M{extract_property(tokens, index)},
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence M{tokens} and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, defaults
    to the classname.

    Features compare equal when they are instances of the very same class
    and look at the same positions; equal Features hash equally, so they
    can be stored in sets and used as dict keys. Features are ordered by
    class name first and by their (sorted) positions second.
    """

    json_tag = 'nltk.tbl.Feature'

    # Display name of the property; subclasses may override it. When left
    # as None, each instance falls back to the name of its class.
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at C{positions}.

        #For instance, importing some concrete subclasses (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        #Feature Word, applying at one of [-2, -1]
        >>> Word([-2,-1])
        Word([-2, -1])

        #Positions need not be contiguous
        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        #Contiguous ranges can alternatively be specified giving the
        #two endpoints (inclusive)
        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        #In two-arg form, start <= end is enforced
        >>> Pos(2, 1)
        Traceback (most recent call last):
          ...
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this features should apply
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply
        """
        if end is None:
            # One-argument form: an iterable of positions. Duplicates are
            # dropped and the result is stored as a sorted tuple of ints.
            self.positions = tuple(sorted({int(i) for i in positions}))
        else:
            # Two-argument form: positions was actually not a list, but
            # only the start index of an inclusive range.
            try:
                if positions > end:
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    "illegal interval specification: (start={0}, end={1})".format(
                        positions, end
                    )
                )

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        """Return the JSON-serializable state of this Feature (its positions)."""
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a Feature of this class from the positions in C{obj}."""
        positions = obj
        return cls(positions)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, list(self.positions))

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlen. If excludezero is True,
        no Features containing 0 in its positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths
        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        a third optional argument excludes all Features whose positions contain zero
        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive
        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
          ...
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError("non-positive window length in {0}".format(winlens))
        # Every window is a slice of starts: for each window length w, slide
        # a window of that length over starts from left to right.
        xs = (starts[i:i + w] for w in winlens for i in range(len(starts) - w + 1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos
        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool
        """
        return (
            self.__class__ is other.__class__
            and set(self.positions) >= set(other.positions)
        )

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersects with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos
        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """
        return bool(
            self.__class__ is other.__class__
            and set(self.positions) & set(other.positions)
        )

    # Rich comparisons for Features. Equality requires the very same class
    # and the same positions; ordering is lexicographic on
    # (class name, positions). The ordering methods return NotImplemented
    # for operands that are not Features, so Python can try the reflected
    # operation or report the mismatch itself.
    def __eq__(self, other):
        return self.__class__ is other.__class__ and self.positions == other.positions

    def __hash__(self):
        # Must agree with __eq__: equal Features (same class, same
        # positions) hash equally. Defining __eq__ alone would leave the
        # class unhashable on Python 3.
        return hash((self.__class__, self.positions))

    def __lt__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        # Compare class names first and fall back to positions only when
        # the names tie. self.positions is a sorted tuple of ints, so tuple
        # comparison gives a consistent ordering (a < b excludes b < a).
        return (self.__class__.__name__, self.positions) < (
            other.__class__.__name__,
            other.positions,
        )

    def __ne__(self, other):
        return not (self == other)

    def __gt__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        return other < self

    def __ge__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        return not self < other

    def __le__(self, other):
        if not isinstance(other, Feature):
            return NotImplemented
        return self < other or self == other

    @staticmethod
    @abstractmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        """