laywerrobot/lib/python3.6/site-packages/nltk/tbl/feature.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see  LICENSE.TXT

from __future__ import division, print_function, unicode_literals
from abc import ABCMeta, abstractmethod
from six import add_metaclass


@add_metaclass(ABCMeta)
class Feature(object):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, ``extract_property(tokens, index)``,
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence ``tokens`` and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, defaults
    to the classname.

    Two Features are equal when they are instances of exactly the same class
    and have the same positions; equal Features hash equally. Features are
    ordered by class name first and by positions second.

    """

    json_tag = 'nltk.tbl.Feature'
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at ``positions``.

        #For instance, importing some concrete subclasses (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        #Feature Word, applying at one of [-2, -1]
        >>> Word([-2,-1])
        Word([-2, -1])

        #Positions need not be contiguous
        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        #Contiguous ranges can alternatively be specified giving the
        #two endpoints (inclusive)
        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        #In two-arg form, start <= end is enforced
        >>> Pos(2, 1)
        Traceback (most recent call last):
          ...
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this feature should apply;
            duplicates are dropped and the result is stored as a sorted tuple
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply

        """
        self.positions = None  # to avoid warnings
        if end is None:
            # One-argument form: normalise to a sorted tuple of unique ints.
            # Whatever iteration or int() raises on bad input propagates as is.
            self.positions = tuple(sorted(set(int(i) for i in positions)))
        else:
            # Two-argument form: positions was actually not a list, but only
            # the start index of an inclusive range.
            try:
                if positions > end:
                    # Funnel a reversed interval into the handler below.
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    "illegal interval specification: (start={0}, end={1})".format(positions, end)
                )

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        """Return the JSON-serialisable state of this Feature (its positions)."""
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a Feature of this class from the positions in ``obj``."""
        positions = obj
        return cls(positions)

    def __repr__(self):
        return "%s(%r)" % (
            self.__class__.__name__, list(self.positions))

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlens. If excludezero is True,
        no Features containing 0 in its positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths
        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        a third optional argument excludes all Features whose positions contain zero
        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive
        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
          ...
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError("non-positive window length in {0}".format(winlens))
        # Slide a window of each requested length over starts, shortest
        # window length first (in the order given by winlens).
        xs = (starts[i:i+w] for w in winlens for i in range(len(starts)-w+1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool


        """
        return self.__class__ is other.__class__ and set(self.positions) >= set(other.positions)

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersects with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        #For instance, importing a concrete subclass (Feature is abstract)
        >>> from nltk.tag.brill import Word, Pos

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True

        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        #Feature subclasses must agree
        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """

        return bool((self.__class__ is other.__class__ and set(self.positions) & set(other.positions)))

    # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
    # it will be enough to define __lt__ and __eq__
    def __eq__(self, other):
        return (self.__class__ is other.__class__ and self.positions == other.positions)

    def __hash__(self):
        # Defining __eq__ alone leaves instances unhashable on Python 3 (and
        # hashed by identity on Python 2); hash on the same key __eq__ uses.
        return hash((self.__class__, self.positions))

    def __lt__(self, other):
        # Lexicographic order: class name first, positions only as tie-break.
        # (Or-ing the two comparisons would let both a < b and b < a hold.)
        self_name = self.__class__.__name__
        other_name = other.__class__.__name__
        if self_name != other_name:
            return self_name < other_name
        #    self.positions is a sorted tuple of ints
        return self.positions < other.positions

    def __ne__(self, other):
        return not (self == other)

    def __gt__(self, other):
        return other < self

    def __ge__(self, other):
        return not self < other

    def __le__(self, other):
        return self < other or self == other

    @staticmethod
    @abstractmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        """
first commit 2020-08-27 21:55:39 +02:00			`# -- coding: utf-8 --`
			`# Natural Language Toolkit: Transformation-based learning`
			`#`
			`# Copyright (C) 2001-2018 NLTK Project`
			`# Author: Marcus Uneson <marcus.uneson@gmail.com>`
			`# based on previous (nltk2) version by`
			`# Christopher Maloof, Edward Loper, Steven Bird`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`from __future__ import division, print_function, unicode_literals`
			`from abc import ABCMeta, abstractmethod`
			`from six import add_metaclass`


			`@add_metaclass(ABCMeta)`
			`class Feature(object):`
			`"""`
			`An abstract base class for Features. A Feature is a combination of`
			`a specific property-computing method and a list of relative positions`
			`to apply that method to.`

			`The property-computing method, M{extract_property(tokens, index)},`
			`must be implemented by every subclass. It extracts or computes a specific`
			`property for the token at the current index. Typical extract_property()`
			`methods return features such as the token text or tag; but more involved`
			`methods may consider the entire sequence M{tokens} and`
			`for instance compute the length of the sentence the token belongs to.`

			`In addition, the subclass may have a PROPERTY_NAME, which is how`
			`it will be printed (in Rules and Templates, etc). If not given, defaults`
			`to the classname.`

			`"""`

			`json_tag = 'nltk.tbl.Feature'`
			`PROPERTY_NAME = None`

			`def __init__(self, positions, end=None):`
			`"""`
			`Construct a Feature which may apply at C{positions}.`

			`#For instance, importing some concrete subclasses (Feature is abstract)`
			`>>> from nltk.tag.brill import Word, Pos`

			`#Feature Word, applying at one of [-2, -1]`
			`>>> Word([-2,-1])`
			`Word([-2, -1])`

			`#Positions need not be contiguous`
			`>>> Word([-2,-1, 1])`
			`Word([-2, -1, 1])`

			`#Contiguous ranges can alternatively be specified giving the`
			`#two endpoints (inclusive)`
			`>>> Pos(-3, -1)`
			`Pos([-3, -2, -1])`

			`#In two-arg form, start <= end is enforced`
			`>>> Pos(2, 1)`
			`Traceback (most recent call last):`
			`File "<stdin>", line 1, in <module>`
			`File "nltk/tbl/template.py", line 306, in __init__`
			`raise TypeError`
			`ValueError: illegal interval specification: (start=2, end=1)`

			`:type positions: list of int`
			`:param positions: the positions at which this features should apply`
			`:raises ValueError: illegal position specifications`

			`An alternative calling convention, for contiguous positions only,`
			`is Feature(start, end):`

			`:type start: int`
			`:param start: start of range where this feature should apply`
			`:type end: int`
			`:param end: end of range (NOTE: inclusive!) where this feature should apply`

			`"""`
			`self.positions = None # to avoid warnings`
			`if end is None:`
			`self.positions = tuple(sorted(set([int(i) for i in positions])))`
			`else: # positions was actually not a list, but only the start index`
			`try:`
			`if positions > end:`
			`raise TypeError`
			`self.positions = tuple(range(positions, end+1))`
			`except TypeError:`
			`# let any kind of erroneous spec raise ValueError`
			`raise ValueError("illegal interval specification: (start={0}, end={1})".format(positions, end))`

			`# set property name given in subclass, or otherwise name of subclass`
			`self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__`

			`def encode_json_obj(self):`
			`return self.positions`

			`@classmethod`
			`def decode_json_obj(cls, obj):`
			`positions = obj`
			`return cls(positions)`

			`def __repr__(self):`
			`return "%s(%r)" % (`
			`self.__class__.__name__, list(self.positions))`

			`@classmethod`
			`def expand(cls, starts, winlens, excludezero=False):`
			`"""`
			`Return a list of features, one for each start point in starts`
			`and for each window length in winlen. If excludezero is True,`
			`no Features containing 0 in its positions will be generated`
			`(many tbl trainers have a special representation for the`
			`target feature at [0])`

			`For instance, importing a concrete subclass (Feature is abstract)`
			`>>> from nltk.tag.brill import Word`

			`First argument gives the possible start positions, second the`
			`possible window lengths`
			`>>> Word.expand([-3,-2,-1], [1])`
			`[Word([-3]), Word([-2]), Word([-1])]`

			`>>> Word.expand([-2,-1], [1])`
			`[Word([-2]), Word([-1])]`

			`>>> Word.expand([-3,-2,-1], [1,2])`
			`[Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]`

			`>>> Word.expand([-2,-1], [1])`
			`[Word([-2]), Word([-1])]`

			`a third optional argument excludes all Features whose positions contain zero`
			`>>> Word.expand([-2,-1,0], [1,2], excludezero=False)`
			`[Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]`

			`>>> Word.expand([-2,-1,0], [1,2], excludezero=True)`
			`[Word([-2]), Word([-1]), Word([-2, -1])]`

			`All window lengths must be positive`
			`>>> Word.expand([-2,-1], [0])`
			`Traceback (most recent call last):`
			`File "<stdin>", line 1, in <module>`
			`File "nltk/tag/tbl/template.py", line 371, in expand`
			`:param starts: where to start looking for Feature`
			`ValueError: non-positive window length in [0]`

			`:param starts: where to start looking for Feature`
			`:type starts: list of ints`
			`:param winlens: window lengths where to look for Feature`
			`:type starts: list of ints`
			`:param excludezero: do not output any Feature with 0 in any of its positions.`
			`:type excludezero: bool`
			`:returns: list of Features`
			`:raises ValueError: for non-positive window lengths`
			`"""`
			`if not all(x > 0 for x in winlens):`
			`raise ValueError("non-positive window length in {0}".format(winlens))`
			`xs = (starts[i:i+w] for w in winlens for i in range(len(starts)-w+1))`
			`return [cls(x) for x in xs if not (excludezero and 0 in x)]`

			`def issuperset(self, other):`
			`"""`
			`Return True if this Feature always returns True when other does`

			`More precisely, return True if this feature refers to the same property as other;`
			`and this Feature looks at all positions that other does (and possibly`
			`other positions in addition).`

			`#For instance, importing a concrete subclass (Feature is abstract)`
			`>>> from nltk.tag.brill import Word, Pos`

			`>>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))`
			`True`

			`>>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))`
			`False`

			`#Feature subclasses must agree`
			`>>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))`
			`False`

			`:param other: feature with which to compare`
			`:type other: (subclass of) Feature`
			`:return: True if this feature is superset, otherwise False`
			`:rtype: bool`


			`"""`
			`return self.__class__ is other.__class__ and set(self.positions) >= set(other.positions)`

			`def intersects(self, other):`
			`"""`
			`Return True if the positions of this Feature intersects with those of other`

			`More precisely, return True if this feature refers to the same property as other;`
			`and there is some overlap in the positions they look at.`

			`#For instance, importing a concrete subclass (Feature is abstract)`
			`>>> from nltk.tag.brill import Word, Pos`

			`>>> Word([-3,-2,-1]).intersects(Word([-3,-2]))`
			`True`

			`>>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))`
			`True`

			`>>> Word([-3,-2,-1]).intersects(Word([0]))`
			`False`

			`#Feature subclasses must agree`
			`>>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))`
			`False`

			`:param other: feature with which to compare`
			`:type other: (subclass of) Feature`
			`:return: True if feature classes agree and there is some overlap in the positions they look at`
			`:rtype: bool`
			`"""`

			`return bool((self.__class__ is other.__class__ and set(self.positions) & set(other.positions)))`

			`# Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),`
			`# it will be enough to define __lt__ and __eq__`
			`def __eq__(self, other):`
			`return (self.__class__ is other.__class__ and self.positions == other.positions)`

			`def __lt__(self, other):`
			`return (`
			`self.__class__.__name__ < other.__class__.__name__ or`
			`# self.positions is a sorted tuple of ints`
			`self.positions < other.positions`
			`)`

			`def __ne__(self, other):`
			`return not (self == other)`

			`def __gt__(self, other):`
			`return other < self`

			`def __ge__(self, other):`
			`return not self < other`

			`def __le__(self, other):`
			`return self < other or self == other`

			`@staticmethod`
			`@abstractmethod`
			`def extract_property(tokens, index):`
			`"""`
			`Any subclass of Feature must define static method extract_property(tokens, index)`

			`:param tokens: the sequence of tokens`
			`:type tokens: list of tokens`
			`:param index: the current index`
			`:type index: int`
			`:return: feature value`
			`:rtype: any (but usually scalar)`
			`"""`