"""
|
|
Porter Stemmer
|
|
|
|
This is the Porter stemming algorithm. It follows the algorithm
|
|
presented in
|
|
|
|
Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
|
|
|
|
with some optional deviations that can be turned on or off with the
|
|
`mode` argument to the constructor.
|
|
|
|
Martin Porter, the algorithm's inventor, maintains a web page about the
|
|
algorithm at
|
|
|
|
http://www.tartarus.org/~martin/PorterStemmer/
|
|
|
|
which includes another Python implementation and other implementations
|
|
in many languages.
|
|
"""
|
|
|
|
from __future__ import print_function, unicode_literals

__docformat__ = 'plaintext'

import re

from nltk.stem.api import StemmerI
from nltk.compat import python_2_unicode_compatible


@python_2_unicode_compatible
class PorterStemmer(StemmerI):
    """
    A word stemmer based on the Porter stemming algorithm.

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    See http://www.tartarus.org/~martin/PorterStemmer/ for the homepage
    of the algorithm.

    Martin Porter has endorsed several modifications to the Porter
    algorithm since writing his original paper, and those extensions are
    included in the implementations on his website. Additionally, others
    have proposed further improvements to the algorithm, including NLTK
    contributors. There are thus three modes that can be selected by
    passing the appropriate constant to the class constructor's `mode`
    attribute:

    PorterStemmer.ORIGINAL_ALGORITHM
    - Implementation that is faithful to the original paper.

      Note that Martin Porter has deprecated this version of the
      algorithm. Martin distributes implementations of the Porter
      Stemmer in many languages, hosted at:

          http://www.tartarus.org/~martin/PorterStemmer/

      and all of these implementations include his extensions. He
      strongly recommends against using the original, published
      version of the algorithm; only use this mode if you clearly
      understand why you are choosing to do so.

    PorterStemmer.MARTIN_EXTENSIONS
    - Implementation that uses only the modifications to the
      algorithm that are included in the implementations on Martin
      Porter's website. He has declared the algorithm frozen, so the
      behaviour of those implementations should never change.

    PorterStemmer.NLTK_EXTENSIONS (default)
    - Implementation that includes further improvements devised by
      NLTK contributors or taken from other modified implementations
      found on the web.

    For the best stemming, you should use the default NLTK_EXTENSIONS
    version. However, if you need to get the same results as either the
    original algorithm or one of Martin Porter's hosted versions, for
    compatibility with an existing implementation or dataset, you can use
    one of the other modes instead.
    """

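    # A minimal usage sketch of the modes described above (illustrative;
    # the results shown follow from the rules implemented below):
    #
    #     >>> PorterStemmer().stem('happiness')
    #     'happi'
    #     >>> PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM).stem('dying')
    #     'dy'
    #     >>> PorterStemmer(mode=PorterStemmer.NLTK_EXTENSIONS).stem('dying')
    #     'die'
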
    # Modes the Stemmer can be instantiated in
    NLTK_EXTENSIONS = 'NLTK_EXTENSIONS'
    MARTIN_EXTENSIONS = 'MARTIN_EXTENSIONS'
    ORIGINAL_ALGORITHM = 'ORIGINAL_ALGORITHM'

    def __init__(self, mode=NLTK_EXTENSIONS):
        if mode not in (
            self.NLTK_EXTENSIONS,
            self.MARTIN_EXTENSIONS,
            self.ORIGINAL_ALGORITHM
        ):
            raise ValueError(
                "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
                "PorterStemmer.MARTIN_EXTENSIONS, or "
                "PorterStemmer.ORIGINAL_ALGORITHM"
            )

        self.mode = mode

        if self.mode == self.NLTK_EXTENSIONS:
            # This is a table of irregular forms. It is quite short,
            # but still reflects the errors actually drawn to Martin
            # Porter's attention over a 20 year period!
            irregular_forms = {
                "sky": ["sky", "skies"],
                "die": ["dying"],
                "lie": ["lying"],
                "tie": ["tying"],
                "news": ["news"],
                "inning": ["innings", "inning"],
                "outing": ["outings", "outing"],
                "canning": ["cannings", "canning"],
                "howe": ["howe"],
                "proceed": ["proceed"],
                "exceed": ["exceed"],
                "succeed": ["succeed"],
            }

            self.pool = {}
            for key in irregular_forms:
                for val in irregular_forms[key]:
                    self.pool[val] = key

        self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])

    def _is_consonant(self, word, i):
        """Returns True if word[i] is a consonant, False otherwise

        A consonant is defined in the paper as follows:

            A consonant in a word is a letter other than A, E, I, O or
            U, and other than Y preceded by a consonant. (The fact that
            the term `consonant' is defined to some extent in terms of
            itself does not make it ambiguous.) So in TOY the consonants
            are T and Y, and in SYZYGY they are S, Z and G. If a letter
            is not a consonant it is a vowel.
        """
        if word[i] in self.vowels:
            return False
        if word[i] == 'y':
            if i == 0:
                return True
            else:
                return not self._is_consonant(word, i - 1)
        return True

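    # Illustrative checks, following the TOY/SYZYGY examples quoted in the
    # docstring above (default vowel set assumed):
    #
    #     >>> PorterStemmer()._is_consonant('toy', 2)     # Y after a vowel
    #     True
    #     >>> PorterStemmer()._is_consonant('syzygy', 1)  # Y after a consonant
    #     False
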
    def _measure(self, stem):
        """Returns the 'measure' of stem, per definition in the paper

        From the paper:

            A consonant will be denoted by c, a vowel by v. A list
            ccc... of length greater than 0 will be denoted by C, and a
            list vvv... of length greater than 0 will be denoted by V.
            Any word, or part of a word, therefore has one of the four
            forms:

                CVCV ... C
                CVCV ... V
                VCVC ... C
                VCVC ... V

            These may all be represented by the single form

                [C]VCVC ... [V]

            where the square brackets denote arbitrary presence of their
            contents. Using (VC){m} to denote VC repeated m times, this
            may again be written as

                [C](VC){m}[V].

            m will be called the "measure" of any word or word part when
            represented in this form. The case m = 0 covers the null
            word. Here are some examples:

                m=0    TR,  EE,  TREE,  Y,  BY.
                m=1    TROUBLE,  OATS,  TREES,  IVY.
                m=2    TROUBLES,  PRIVATE,  OATEN,  ORRERY.
        """
        cv_sequence = ''

        # Construct a string of 'c's and 'v's representing whether each
        # character in `stem` is a consonant or a vowel.
        # e.g. 'falafel' becomes 'cvcvcvc',
        #      'architecture' becomes 'vcccvcvccvcv'
        for i in range(len(stem)):
            if self._is_consonant(stem, i):
                cv_sequence += 'c'
            else:
                cv_sequence += 'v'

        # Count the number of 'vc' occurrences, which is equivalent to
        # the number of 'VC' occurrences in Porter's reduced form in the
        # docstring above, which is in turn equivalent to `m`.
        return cv_sequence.count('vc')

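    # Spot checks against the m=0/1/2 examples quoted in the docstring
    # above (illustrative only):
    #
    #     >>> PorterStemmer()._measure('tree')
    #     0
    #     >>> PorterStemmer()._measure('trouble')
    #     1
    #     >>> PorterStemmer()._measure('troubles')
    #     2
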
    def _has_positive_measure(self, stem):
        return self._measure(stem) > 0

    def _contains_vowel(self, stem):
        """Returns True if stem contains a vowel, else False"""
        for i in range(len(stem)):
            if not self._is_consonant(stem, i):
                return True
        return False

    def _ends_double_consonant(self, word):
        """Implements condition *d from the paper

        Returns True if word ends with a double consonant
        """
        return (
            len(word) >= 2 and
            word[-1] == word[-2] and
            self._is_consonant(word, len(word) - 1)
        )

    def _ends_cvc(self, word):
        """Implements condition *o from the paper

        From the paper:

            *o  - the stem ends cvc, where the second c is not W, X or Y
                  (e.g. -WIL, -HOP).
        """
        return (
            len(word) >= 3 and
            self._is_consonant(word, len(word) - 3) and
            not self._is_consonant(word, len(word) - 2) and
            self._is_consonant(word, len(word) - 1) and
            word[-1] not in ('w', 'x', 'y')
        ) or (
            self.mode == self.NLTK_EXTENSIONS and
            len(word) == 2 and
            not self._is_consonant(word, 0) and
            self._is_consonant(word, 1)
        )

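    # Illustrative checks for *o, using the -HOP example quoted in the
    # docstring above (default NLTK_EXTENSIONS mode assumed):
    #
    #     >>> PorterStemmer()._ends_cvc('hop')
    #     True
    #     >>> PorterStemmer()._ends_cvc('hollow')   # final consonant is 'w'
    #     False
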
    def _replace_suffix(self, word, suffix, replacement):
        """Replaces `suffix` of `word` with `replacement`"""
        assert word.endswith(suffix), "Given word doesn't end with given suffix"
        if suffix == '':
            return word + replacement
        else:
            return word[:-len(suffix)] + replacement

    def _apply_rule_list(self, word, rules):
        """Applies the first applicable suffix-removal rule to the word

        Takes a word and a list of suffix-removal rules represented as
        3-tuples, with the first element being the suffix to remove,
        the second element being the string to replace it with, and the
        final element being the condition for the rule to be applicable,
        or None if the rule is unconditional.
        """
        for rule in rules:
            suffix, replacement, condition = rule
            if suffix == '*d' and self._ends_double_consonant(word):
                stem = word[:-2]
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word
            if word.endswith(suffix):
                stem = self._replace_suffix(word, suffix, '')
                if condition is None or condition(stem):
                    return stem + replacement
                else:
                    # Don't try any further rules
                    return word

        return word

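    # Sketch of how a single rule tuple is applied, mirroring the Step 1a
    # table below (suffix, replacement, condition; None = unconditional):
    #
    #     >>> PorterStemmer()._apply_rule_list('caresses',
    #     ...                                  [('sses', 'ss', None)])
    #     'caress'
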
    def _step1a(self, word):
        """Implements Step 1a from "An algorithm for suffix stripping"

        From the paper:

            SSES -> SS                         caresses  ->  caress
            IES  -> I                          ponies    ->  poni
                                               ties      ->  ti
            SS   -> SS                         caress    ->  caress
            S    ->                            cats      ->  cat
        """
        # this NLTK-only rule extends the original algorithm, so
        # that 'flies'->'fli' but 'dies'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith('ies') and len(word) == 4:
                return self._replace_suffix(word, 'ies', 'ie')

        return self._apply_rule_list(word, [
            ('sses', 'ss', None),  # SSES -> SS
            ('ies', 'i', None),    # IES  -> I
            ('ss', 'ss', None),    # SS   -> SS
            ('s', '', None),       # S    ->
        ])

    def _step1b(self, word):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                    feed      ->  feed
                                               agreed    ->  agree
            (*v*) ED  ->                       plastered ->  plaster
                                               bled      ->  bled
            (*v*) ING ->                       motoring  ->  motor
                                               sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
               -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """
        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == self.NLTK_EXTENSIONS:
            if word.endswith('ied'):
                if len(word) == 4:
                    return self._replace_suffix(word, 'ied', 'ie')
                else:
                    return self._replace_suffix(word, 'ied', 'i')

        # (m>0) EED -> EE
        if word.endswith('eed'):
            stem = self._replace_suffix(word, 'eed', '')
            if self._measure(stem) > 0:
                return stem + 'ee'
            else:
                return word

        rule_2_or_3_succeeded = False

        for suffix in ['ed', 'ing']:
            if word.endswith(suffix):
                intermediate_stem = self._replace_suffix(word, suffix, '')
                if self._contains_vowel(intermediate_stem):
                    rule_2_or_3_succeeded = True
                    break

        if not rule_2_or_3_succeeded:
            return word

        return self._apply_rule_list(intermediate_stem, [
            ('at', 'ate', None),  # AT -> ATE
            ('bl', 'ble', None),  # BL -> BLE
            ('iz', 'ize', None),  # IZ -> IZE
            # (*d and not (*L or *S or *Z))
            # -> single letter
            (
                '*d',
                intermediate_stem[-1],
                lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')
            ),
            # (m=1 and *o) -> E
            (
                '',
                'e',
                lambda stem: (self._measure(stem) == 1 and
                              self._ends_cvc(stem))
            ),
        ])

    def _step1c(self, word):
        """Implements Step 1c from "An algorithm for suffix stripping"

        From the paper:

            Step 1c

                (*v*) Y -> I                    happy        ->  happi
                                                sky          ->  sky
        """
        def nltk_condition(stem):
            """
            This has been modified from the original Porter algorithm so
            that y->i is only done when y is preceded by a consonant,
            but not if the stem is only a single consonant, i.e.

                (*c and not c) Y -> I

            So 'happy' -> 'happi', but
               'enjoy' -> 'enjoy'  etc

            This is a much better rule. Formerly 'enjoy'->'enjoi' and
            'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but
            with this modification that no longer really matters.

            Also, the removal of the contains_vowel(z) condition means
            that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and
            conflate with 'spied', 'tried', 'flies' ...
            """
            return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1)

        def original_condition(stem):
            return self._contains_vowel(stem)

        return self._apply_rule_list(word, [
            (
                'y',
                'i',
                nltk_condition if self.mode == self.NLTK_EXTENSIONS
                else original_condition
            )
        ])

    def _step2(self, word):
        """Implements Step 2 from "An algorithm for suffix stripping"

        From the paper:

            Step 2

                (m>0) ATIONAL ->  ATE       relational     ->  relate
                (m>0) TIONAL  ->  TION      conditional    ->  condition
                                            rational       ->  rational
                (m>0) ENCI    ->  ENCE      valenci        ->  valence
                (m>0) ANCI    ->  ANCE      hesitanci      ->  hesitance
                (m>0) IZER    ->  IZE       digitizer      ->  digitize
                (m>0) ABLI    ->  ABLE      conformabli    ->  conformable
                (m>0) ALLI    ->  AL        radicalli      ->  radical
                (m>0) ENTLI   ->  ENT       differentli    ->  different
                (m>0) ELI     ->  E         vileli         ->  vile
                (m>0) OUSLI   ->  OUS       analogousli    ->  analogous
                (m>0) IZATION ->  IZE       vietnamization ->  vietnamize
                (m>0) ATION   ->  ATE       predication    ->  predicate
                (m>0) ATOR    ->  ATE       operator       ->  operate
                (m>0) ALISM   ->  AL        feudalism      ->  feudal
                (m>0) IVENESS ->  IVE       decisiveness   ->  decisive
                (m>0) FULNESS ->  FUL       hopefulness    ->  hopeful
                (m>0) OUSNESS ->  OUS       callousness    ->  callous
                (m>0) ALITI   ->  AL        formaliti      ->  formal
                (m>0) IVITI   ->  IVE       sensitiviti    ->  sensitive
                (m>0) BILITI  ->  BLE       sensibiliti    ->  sensible
        """

        if self.mode == self.NLTK_EXTENSIONS:
            # Instead of applying the ALLI -> AL rule after '(a)bli' per
            # the published algorithm, we apply it first, and, if it
            # succeeds, run the result through step2 again.
            if (
                word.endswith('alli') and
                self._has_positive_measure(
                    self._replace_suffix(word, 'alli', '')
                )
            ):
                return self._step2(
                    self._replace_suffix(word, 'alli', 'al')
                )

        bli_rule = ('bli', 'ble', self._has_positive_measure)
        abli_rule = ('abli', 'able', self._has_positive_measure)

        rules = [
            ('ational', 'ate', self._has_positive_measure),
            ('tional', 'tion', self._has_positive_measure),
            ('enci', 'ence', self._has_positive_measure),
            ('anci', 'ance', self._has_positive_measure),
            ('izer', 'ize', self._has_positive_measure),

            abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule,

            ('alli', 'al', self._has_positive_measure),
            ('entli', 'ent', self._has_positive_measure),
            ('eli', 'e', self._has_positive_measure),
            ('ousli', 'ous', self._has_positive_measure),
            ('ization', 'ize', self._has_positive_measure),
            ('ation', 'ate', self._has_positive_measure),
            ('ator', 'ate', self._has_positive_measure),
            ('alism', 'al', self._has_positive_measure),
            ('iveness', 'ive', self._has_positive_measure),
            ('fulness', 'ful', self._has_positive_measure),
            ('ousness', 'ous', self._has_positive_measure),
            ('aliti', 'al', self._has_positive_measure),
            ('iviti', 'ive', self._has_positive_measure),
            ('biliti', 'ble', self._has_positive_measure),
        ]

        if self.mode == self.NLTK_EXTENSIONS:
            rules.append(
                ('fulli', 'ful', self._has_positive_measure)
            )

            # The 'l' of the 'logi' -> 'log' rule is put with the stem,
            # so that short stems like 'geo' 'theo' etc work like
            # 'archaeo' 'philo' etc.
            rules.append((
                "logi",
                "log",
                lambda stem: self._has_positive_measure(word[:-3])
            ))

        if self.mode == self.MARTIN_EXTENSIONS:
            rules.append(
                ("logi", "log", self._has_positive_measure)
            )

        return self._apply_rule_list(word, rules)

    def _step3(self, word):
        """Implements Step 3 from "An algorithm for suffix stripping"

        From the paper:

            Step 3

                (m>0) ICATE ->  IC          triplicate     ->  triplic
                (m>0) ATIVE ->              formative      ->  form
                (m>0) ALIZE ->  AL          formalize      ->  formal
                (m>0) ICITI ->  IC          electriciti    ->  electric
                (m>0) ICAL  ->  IC          electrical     ->  electric
                (m>0) FUL   ->              hopeful        ->  hope
                (m>0) NESS  ->              goodness       ->  good
        """
        return self._apply_rule_list(word, [
            ('icate', 'ic', self._has_positive_measure),
            ('ative', '', self._has_positive_measure),
            ('alize', 'al', self._has_positive_measure),
            ('iciti', 'ic', self._has_positive_measure),
            ('ical', 'ic', self._has_positive_measure),
            ('ful', '', self._has_positive_measure),
            ('ness', '', self._has_positive_measure),
        ])

    def _step4(self, word):
        """Implements Step 4 from "An algorithm for suffix stripping"

        From the paper:

            Step 4

                (m>1) AL    ->              revival        ->  reviv
                (m>1) ANCE  ->              allowance      ->  allow
                (m>1) ENCE  ->              inference      ->  infer
                (m>1) ER    ->              airliner       ->  airlin
                (m>1) IC    ->              gyroscopic     ->  gyroscop
                (m>1) ABLE  ->              adjustable     ->  adjust
                (m>1) IBLE  ->              defensible     ->  defens
                (m>1) ANT   ->              irritant       ->  irrit
                (m>1) EMENT ->              replacement    ->  replac
                (m>1) MENT  ->              adjustment     ->  adjust
                (m>1) ENT   ->              dependent      ->  depend
                (m>1 and (*S or *T)) ION -> adoption       ->  adopt
                (m>1) OU    ->              homologou      ->  homolog
                (m>1) ISM   ->              communism      ->  commun
                (m>1) ATE   ->              activate       ->  activ
                (m>1) ITI   ->              angulariti     ->  angular
                (m>1) OUS   ->              homologous     ->  homolog
                (m>1) IVE   ->              effective      ->  effect
                (m>1) IZE   ->              bowdlerize     ->  bowdler

        The suffixes are now removed. All that remains is a little
        tidying up.
        """
        measure_gt_1 = lambda stem: self._measure(stem) > 1

        return self._apply_rule_list(word, [
            ('al', '', measure_gt_1),
            ('ance', '', measure_gt_1),
            ('ence', '', measure_gt_1),
            ('er', '', measure_gt_1),
            ('ic', '', measure_gt_1),
            ('able', '', measure_gt_1),
            ('ible', '', measure_gt_1),
            ('ant', '', measure_gt_1),
            ('ement', '', measure_gt_1),
            ('ment', '', measure_gt_1),
            ('ent', '', measure_gt_1),

            # (m>1 and (*S or *T)) ION ->
            (
                'ion',
                '',
                lambda stem: self._measure(stem) > 1 and stem[-1] in ('s', 't')
            ),

            ('ou', '', measure_gt_1),
            ('ism', '', measure_gt_1),
            ('ate', '', measure_gt_1),
            ('iti', '', measure_gt_1),
            ('ous', '', measure_gt_1),
            ('ive', '', measure_gt_1),
            ('ize', '', measure_gt_1),
        ])

    def _step5a(self, word):
        """Implements Step 5a from "An algorithm for suffix stripping"

        From the paper:

            Step 5a

                (m>1) E     ->              probate        ->  probat
                                            rate           ->  rate
                (m=1 and not *o) E ->       cease          ->  ceas
        """
        # Note that Martin's test vocabulary and reference
        # implementations are inconsistent in how they handle the case
        # where two rules both refer to a suffix that matches the word
        # to be stemmed, but only the condition of the second one is
        # true.
        # Earlier, in step 1b, we had the rules:
        #     (m>0) EED -> EE
        #     (*v*) ED  ->
        # but the examples in the paper included "feed"->"feed", even
        # though (*v*) is true for "fe" and therefore the second rule
        # alone would map "feed"->"fe".
        # However, in THIS case, we need to handle the consecutive rules
        # differently and try both conditions (obviously; the second
        # rule here would be redundant otherwise). Martin's paper makes
        # no explicit mention of the inconsistency; you have to infer it
        # from the examples.
        # For this reason, we can't use _apply_rule_list here.
        if word.endswith('e'):
            stem = self._replace_suffix(word, 'e', '')
            if self._measure(stem) > 1:
                return stem
            if self._measure(stem) == 1 and not self._ends_cvc(stem):
                return stem
        return word

    def _step5b(self, word):
        """Implements Step 5b from "An algorithm for suffix stripping"

        From the paper:

            Step 5b

                (m > 1 and *d and *L) -> single letter
                                            controll       ->  control
                                            roll           ->  roll
        """
        return self._apply_rule_list(word, [
            ('ll', 'l', lambda stem: self._measure(word[:-1]) > 1)
        ])

    def stem(self, word):
        stem = word.lower()

        if self.mode == self.NLTK_EXTENSIONS and word in self.pool:
            return self.pool[word]

        if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2:
            # With this line, strings of length 1 or 2 don't go through
            # the stemming process, although no mention is made of this
            # in the published algorithm.
            return word

        stem = self._step1a(stem)
        stem = self._step1b(stem)
        stem = self._step1c(stem)
        stem = self._step2(stem)
        stem = self._step3(stem)
        stem = self._step4(stem)
        stem = self._step5a(stem)
        stem = self._step5b(stem)

        return stem

    def __repr__(self):
        return '<PorterStemmer>'


def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
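

# Optional convenience entry point (an assumption, not part of the algorithm):
# running this module directly would print the demo's side-by-side comparison.
if __name__ == '__main__':
    demo()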