laywerrobot/lib/python3.6/site-packages/nltk/stem/rslp.py

# -*- coding: utf-8 -*-

# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2018 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, which unfortunately I had no access to. The code is a
# Python version, with some minor modifications of mine, to the description
# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.

# Este código é baseado no algoritmo apresentado no artigo "A Stemming
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
# código é uma conversão para Python, com algumas pequenas modificações
# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do
# código para linguagem C disponível em
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
# do NLTK para o português para qualquer debate.
from __future__ import print_function, unicode_literals
from nltk.data import load

from nltk.stem.api import StemmerI

class RSLPStemmer(StemmerI):
    """
    A stemmer for Portuguese.

    The stemmer is driven by seven rule tables (``step0.pt`` .. ``step6.pt``)
    loaded from the ``nltk:stemmers/rslp/`` resource.  The tables are stored
    in ``self._model`` and are addressed by index:

    0. plural reduction
    1. feminine reduction
    2. adverb reduction
    3. augmentative reduction
    4. noun reduction
    5. verb reduction
    6. vowel removal

        >>> from nltk.stem import RSLPStemmer
        >>> st = RSLPStemmer()
        >>> # opening lines of Erico Verissimo's "Música ao Longe"
        >>> text = '''
        ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
        ... devem copiar . Uma casinha de porta e janela , em cima duma
        ... coxilha .'''
        >>> for token in text.split():
        ...     print(st.stem(token))
        clariss risc com giz no quadro-negr a pais que os alun dev copi .
        uma cas de port e janel , em cim dum coxilh .
    """

    def __init__ (self):
        """Load the seven rule tables, in step order, into ``self._model``."""
        self._model = [self.read_rule("step%d.pt" % step) for step in range(7)]

    def read_rule (self, filename):
        """
        Load and parse one rule file from ``nltk:stemmers/rslp/``.

        Each non-blank line that does not start with ``#`` is split on tabs
        into: a quoted suffix, an integer minimum stem size, a quoted
        replacement and, optionally, a comma-separated list of quoted
        exception words.

        :param filename: name of the rule file, e.g. ``"step0.pt"``
        :return: a list of rules, each one a list of the form
            ``[suffix, min_stem_size, replacement, exceptions]``
        :raises IndexError: if a rule line has fewer than three columns
        :raises ValueError: if the minimum stem size is not an integer
        """
        raw = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")

        rules = []
        for line in raw.split("\n"):
            # tolerate CRLF line endings: a stray '\r' would otherwise end up
            # inside the last column and corrupt its final exception word
            line = line.rstrip("\r")

            # skip blank lines and comments
            if line == "" or line[0] == "#":
                continue

            # NOTE: a simple but ugly hack to make this parser happy with double '\t's
            tokens = line.replace("\t\t", "\t").split("\t")

            # exceptions to this rule; a missing column means "no exceptions"
            if len(tokens) > 3:
                exceptions = [token[1:-1] for token in tokens[3].split(",")]
            else:
                exceptions = []

            rules.append([
                tokens[0][1:-1],   # text to be searched for at the end of the string (quotes removed)
                int(tokens[1]),    # minimum stem size to perform the replacement
                tokens[2][1:-1],   # text to be replaced into (quotes removed)
                exceptions,
            ])

        return rules

    def stem(self, word):
        """
        Return the stem of ``word``.

        The word is lowercased and then run through the rule tables.  Noun,
        verb and vowel-removal reductions are tried in that order and each
        one is attempted only if the previous one left the word unchanged.

        :param word: the token to be stemmed; an empty string is returned
            unchanged
        :return: the stemmed token
        """
        word = word.lower()

        # the word ends in 's'? apply rule for plural reduction
        # (endswith, unlike word[-1], is safe for an empty string)
        if word.endswith("s"):
            word = self.apply_rule(word, 0)

        # the word ends in 'a'? apply rule for feminine reduction
        if word.endswith("a"):
            word = self.apply_rule(word, 1)

        # augmentative reduction
        word = self.apply_rule(word, 3)

        # adverb reduction
        word = self.apply_rule(word, 2)

        # noun reduction
        prev_word = word
        word = self.apply_rule(word, 4)
        if word != prev_word:
            return word

        # verb reduction (only when the noun reduction changed nothing)
        word = self.apply_rule(word, 5)
        if word != prev_word:
            return word

        # vowel removal (only when the verb reduction changed nothing)
        return self.apply_rule(word, 6)

    def apply_rule(self, word, rule_index):
        """
        Apply the first matching rule of table ``rule_index`` to ``word``.

        A rule matches when the word ends with the rule's suffix, the word is
        at least as long as the suffix plus the rule's minimum stem size, and
        the word is not listed among the rule's exceptions.

        :param word: the (already lowercased) word to transform
        :param rule_index: index of the rule table in ``self._model``
        :return: the word with the suffix replaced, or the word unchanged if
            no rule matches
        """
        for rule in self._model[rule_index]:
            suffix = rule[0]
            min_stem_size = rule[1]
            replacement = rule[2]
            exceptions = rule[3]
            suffix_length = len(suffix)

            if (word[-suffix_length:] == suffix                      # suffix matches
                    and len(word) >= suffix_length + min_stem_size   # we have minimum size
                    and word not in exceptions):                     # not an exception
                # only the first matching rule of a table is applied
                return word[:-suffix_length] + replacement

        return word
first commit 2020-08-27 21:55:39 +02:00			`# -- coding: utf-8 --`

			`# Natural Language Toolkit: RSLP Stemmer`
			`#`
			`# Copyright (C) 2001-2018 NLTK Project`
			`# Author: Tiago Tresoldi <tresoldi@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`# This code is based on the algorithm presented in the paper "A Stemming`
			`# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and`
			`# Christian Huyck, which unfortunately I had no access to. The code is a`
			`# Python version, with some minor modifications of mine, to the description`
			`# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code`
			`# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.`
			`# Please note that this stemmer is intended for demonstration and educational`
			`# purposes only. Feel free to write me for any comments, including the`
			`# development of a different and/or better stemmer for Portuguese. I also`
			`# suggest using NLTK's mailing list for Portuguese for any discussion.`

			`# Este código é baseado no algoritmo apresentado no artigo "A Stemming`
			`# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e`
			`# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O`
			`# código é uma conversão para Python, com algumas pequenas modificações`
			`# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do`
			`# código para linguagem C disponível em`
			`# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,`
			`# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente`
			`# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer`
			`# comentário, inclusive sobre o desenvolvimento de um stemmer diferente`
			`# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão`
			`# do NLTK para o português para qualquer debate.`
			`from __future__ import print_function, unicode_literals`
			`from nltk.data import load`

			`from nltk.stem.api import StemmerI`

			`class RSLPStemmer(StemmerI):`
			`"""`
			`A stemmer for Portuguese.`

			`>>> from nltk.stem import RSLPStemmer`
			`>>> st = RSLPStemmer()`
			`>>> # opening lines of Erico Verissimo's "Música ao Longe"`
			`>>> text = '''`
			`... Clarissa risca com giz no quadro-negro a paisagem que os alunos`
			`... devem copiar . Uma casinha de porta e janela , em cima duma`
			`... coxilha .'''`
			`>>> for token in text.split():`
			`... print(st.stem(token))`
			`clariss risc com giz no quadro-negr a pais que os alun dev copi .`
			`uma cas de port e janel , em cim dum coxilh .`
			`"""`

			`def __init__ (self):`
			`self._model = []`

			`self._model.append( self.read_rule("step0.pt") )`
			`self._model.append( self.read_rule("step1.pt") )`
			`self._model.append( self.read_rule("step2.pt") )`
			`self._model.append( self.read_rule("step3.pt") )`
			`self._model.append( self.read_rule("step4.pt") )`
			`self._model.append( self.read_rule("step5.pt") )`
			`self._model.append( self.read_rule("step6.pt") )`

			`def read_rule (self, filename):`
			`rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")`
			`lines = rules.split("\n")`

			`lines = [line for line in lines if line != ""] # remove blank lines`
			`lines = [line for line in lines if line[0] != "#"] # remove comments`

			`# NOTE: a simple but ugly hack to make this parser happy with double '\t's`
			`lines = [line.replace("\t\t", "\t") for line in lines]`

			`# parse rules`
			`rules = []`
			`for line in lines:`
			`rule = []`
			`tokens = line.split("\t")`

			`# text to be searched for at the end of the string`
			`rule.append( tokens[0][1:-1] ) # remove quotes`

			`# minimum stem size to perform the replacement`
			`rule.append( int(tokens[1]) )`

			`# text to be replaced into`
			`rule.append( tokens[2][1:-1] ) # remove quotes`

			`# exceptions to this rule`
			`rule.append( [token[1:-1] for token in tokens[3].split(",")] )`

			`# append to the results`
			`rules.append(rule)`

			`return rules`

			`def stem(self, word):`
			`word = word.lower()`

			`# the word ends in 's'? apply rule for plural reduction`
			`if word[-1] == "s":`
			`word = self.apply_rule(word, 0)`

			`# the word ends in 'a'? apply rule for feminine reduction`
			`if word[-1] == "a":`
			`word = self.apply_rule(word, 1)`

			`# augmentative reduction`
			`word = self.apply_rule(word, 3)`

			`# adverb reduction`
			`word = self.apply_rule(word, 2)`

			`# noun reduction`
			`prev_word = word`
			`word = self.apply_rule(word, 4)`
			`if word == prev_word:`
			`# verb reduction`
			`prev_word = word`
			`word = self.apply_rule(word, 5)`
			`if word == prev_word:`
			`# vowel removal`
			`word = self.apply_rule(word, 6)`

			`return word`

			`def apply_rule(self, word, rule_index):`
			`rules = self._model[rule_index]`
			`for rule in rules:`
			`suffix_length = len(rule[0])`
			`if word[-suffix_length:] == rule[0]: # if suffix matches`
			`if len(word) >= suffix_length + rule[1]: # if we have minimum size`
			`if word not in rule[3]: # if not an exception`
			`word = word[:-suffix_length] + rule[2]`
			`break`

			`return word`