Coverage for nltk.stem.lancaster: 94%

100

101

102

103

# Natural Language Toolkit: Stemmers

# Author: Steven Tomcavage <stomcava@law.upenn.edu>

# URL: <http://www.nltk.org/>

# For license information, see LICENSE.TXT

"""

A word stemmer based on the Lancaster stemming algorithm.

Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.

"""

import re

from .api import StemmerI

class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

    Rules are encoded as strings of the form
    ``<reversed ending>[*]<digit>[<append>][>|.]``:

    - ``<reversed ending>``: the word ending the rule matches, spelled
      backwards (so rules are indexed by the word's final letter);
    - ``*``: optional flag - the rule only fires if the word has not been
      modified by an earlier rule (the word is still "intact");
    - ``<digit>``: how many trailing characters to remove (``0`` protects
      the ending);
    - ``<append>``: optional letters appended after the removal;
    - ``>``: keep stemming after this rule / ``.``: stop after this rule.

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
    """

    # The rule list is static since it doesn't change between instances
    rule_tuple = (
        "ai*2.",      # -ia > -   if intact
        "a*1.",       # -a > -    if intact
        "bb1.",       # -bb > -b
        "city3s.",    # -ytic > -ys
        "ci2>",       # -ic > -
        "cn1t>",      # -nc > -nt
        "dd1.",       # -dd > -d
        "dei3y>",     # -ied > -y
        "deec2ss.",   # -ceed > -cess
        "dee1.",      # -eed > -ee
        "de2>",       # -ed > -
        "dooh4>",     # -hood > -
        "e1>",        # -e > -
        "feil1v.",    # -lief > -liev
        "fi2>",       # -if > -
        "gni3>",      # -ing > -
        "gai3y.",     # -iag > -y
        "ga2>",       # -ag > -
        "gg1.",       # -gg > -g
        "ht*2.",      # -th > -   if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",      # -ish > -
        "i*1.",       # -i > -    if intact
        "i1y>",       # -i > -y
        "ji1d.",      # -ij > -id   --  see nois4j> & vis3j>
        "juf1s.",     # -fuj > -fus
        "ju1d.",      # -uj > -ud
        "jo1d.",      # -oj > -od
        "jeh1r.",     # -hej > -her
        "jrev1t.",    # -verj > -vert
        "jsim2t.",    # -misj > -mit
        "jn1d.",      # -nj > -nd
        "j1s.",       # -j > -s
        "lbaifi6.",   # -ifiabl > -
        "lbai4y.",    # -iabl > -y
        "lba3>",      # -abl > -
        "lbi3.",      # -ibl > -
        "lib2l>",     # -bil > -bl
        "lc1.",       # -cl > -c
        "lufi4y.",    # -iful > -y
        "luf3>",      # -ful > -
        "lu2.",       # -ul > -
        "lai3>",      # -ial > -
        "lau3>",      # -ual > -
        "la2>",       # -al > -
        "ll1.",       # -ll > -l
        "mui3.",      # -ium > -
        "mu*2.",      # -um > -   if intact
        "msi3>",      # -ism > -
        "mm1.",       # -mm > -m
        "nois4j>",    # -sion > -j
        "noix4ct.",   # -xion > -ct
        "noi3>",      # -ion > -
        "nai3>",      # -ian > -
        "na2>",       # -an > -
        "nee0.",      # protect  -een
        "ne2>",       # -en > -
        "nn1.",       # -nn > -n
        "pihs4>",     # -ship > -
        "pp1.",       # -pp > -p
        "re2>",       # -er > -
        "rae0.",      # protect  -ear
        "ra2.",       # -ar > -
        "ro2>",       # -or > -
        "ru2>",       # -ur > -
        "rr1.",       # -rr > -r
        "rt1>",       # -tr > -t
        "rei3y>",     # -ier > -y
        "sei3y>",     # -ies > -y
        "sis2.",      # -sis > -s
        "si2>",       # -is > -
        "ssen4>",     # -ness > -
        "ss0.",       # protect  -ss
        "suo3>",      # -ous > -
        "su*2.",      # -us > -   if intact
        "s*1>",       # -s > -    if intact
        "s0.",        # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",       # -at > -
        "tnem4>",     # -ment > -
        "tne3>",      # -ent > -
        "tna3>",      # -ant > -
        "tpir2b.",    # -ript > -rib
        "tpro2b.",    # -orpt > -orb
        "tcud1.",     # -duct > -duc
        "tpmus2.",    # -sumpt > -sum
        "tpec2iv.",   # -cept > -ceiv
        "tulo2v.",    # -olut > -olv
        "tsis0.",     # protect  -sist
        "tsi3>",      # -ist > -
        "tt1.",       # -tt > -t
        "uqi3.",      # -iqu > -
        "ugo1.",      # -ogu > -og
        "vis3j>",     # -siv > -j
        "vie0.",      # protect  -eiv
        "vi2>",       # -iv > -
        "ylb1>",      # -bly > -bl
        "yli3y>",     # -ily > -y
        "ylp0.",      # protect  -ply
        "yl2>",       # -ly > -
        "ygo1.",      # -ogy > -og
        "yhp1.",      # -phy > -ph
        "ymo1.",      # -omy > -om
        "ypo1.",      # -opy > -op
        "yti3>",      # -ity > -
        "yte3>",      # -ety > -
        "ytl2.",      # -lty > -l
        "yrtsi5.",    # -istry > -
        "yra3>",      # -ary > -
        "yro3>",      # -ory > -
        "yfi3.",      # -ify > -
        "ycn2t>",     # -ncy > -nt
        "yca3>",      # -acy > -
        "zi2>",       # -iz > -
        "zy1s.",      # -yz > -ys
    )

    # Compiled once for the whole class (raw string, so the backslashes reach
    # the regex engine untouched). Groups: reversed ending, intact flag,
    # number of characters to remove, string to append, continuation flag.
    _RULE_PATTERN = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

    def __init__(self):
        """Create an instance of the Lancaster stemmer.

        The rule dictionary starts out empty; the default rules are loaded
        lazily by the first call to ``stem`` unless ``parseRules`` has been
        called with a custom rule set beforehand.
        """
        # Maps a word's final letter to the list of rule strings for it
        self.rule_dictionary = {}

    def parseRules(self, rule_tuple):
        """Validate a set of rules and install it as this stemmer's rule set.

        Any previously installed rules are replaced. The new rules are
        indexed by their first character, i.e. by the final letter of the
        word ending they match.

        :param rule_tuple: iterable of rule strings (see the class docstring
            for the rule syntax)
        :raises ValueError: if a rule does not follow the rule syntax; in
            that case the previously installed rules are left untouched
        """
        # Build into a local dictionary so that an invalid rule cannot leave
        # the stemmer with a half-populated rule set
        parsed_rules = {}
        for rule in rule_tuple:
            if not self._RULE_PATTERN.match(rule):
                raise ValueError("The rule %s is invalid" % rule)
            parsed_rules.setdefault(rule[0:1], []).append(rule)
        self.rule_dictionary = parsed_rules

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.

        :param word: the word to stem (it is lower-cased first, since all
            the rules are lower-cased)
        :return: the stemmed, lower-cased word
        """
        word = word.lower()

        # If the user hasn't supplied any rules, setup the default rules
        if not self.rule_dictionary:
            self.parseRules(LancasterStemmer.rule_tuple)

        # The second argument is the untouched copy used by "intact" rules
        return self.__doStemming(word, word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming.

        Repeatedly looks up the rules for the word's last letter and applies
        the first one that matches, until a rule ending in ``.`` fires, no
        rule applies, or the last letter has no rules.

        :param word: the lower-cased word to stem
        :param intact_word: the word as it was before any rule was applied
        :return: the stemmed word
        """
        while True:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and there are rules
            # for that last letter
            if last_letter_position < 0:
                return word
            last_letter = word[last_letter_position]
            if last_letter not in self.rule_dictionary:
                return word

            rule_was_applied = False

            # Go through each rule that matches the word's final letter
            for rule in self.rule_dictionary[last_letter]:
                rule_match = self._RULE_PATTERN.match(rule)
                if not rule_match:
                    continue
                (ending_string,
                 intact_flag,
                 remove_total,
                 append_string,
                 cont_flag) = rule_match.groups()

                # The number of characters to remove is captured as text
                remove_total = int(remove_total)

                # Rule endings are stored reversed; un-reverse to compare
                if not word.endswith(ending_string[::-1]):
                    continue
                # "Intact" rules only fire on a word no rule has changed yet
                if intact_flag and word != intact_word:
                    continue
                if not self.__isAcceptable(word, remove_total):
                    continue

                word = self.__applyRule(word, remove_total, append_string)
                rule_was_applied = True
                if cont_flag == ".":
                    # Terminal rule: stemming is finished
                    return word
                # Continuing rule: restart with the rules for the new ending
                break

            # If no rules apply, the word doesn't need any more stemming
            if not rule_was_applied:
                return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last character of the leading
        alphabetic run of this string.

        Scanning stops at the first non-alphabetic character, so for a
        purely alphabetic word this is the index of its last letter.

        :return: the index, or -1 if the string is empty or does not start
            with a letter
        """
        last_letter = -1
        for position, character in enumerate(word):
            if not character.isalpha():
                break
            last_letter = position
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.

        :param word: the current (non-empty) word
        :param remove_total: number of trailing characters the rule removes
        :return: True if removing that many characters is allowed
        """
        remaining_length = len(word) - remove_total

        # If the word starts with a vowel, at least 2 characters must remain
        if word[0] in "aeiouy":
            return remaining_length >= 2

        # If the word starts with a consonant, at least 3 characters must
        # remain and the second or third letter must be a vowel (only those
        # two positions are inspected)
        if remaining_length >= 3:
            return word[1] in "aeiouy" or word[2] in "aeiouy"

        return False

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word.

        :param word: the word to modify
        :param remove_total: number of characters to cut from the end
        :param append_string: letters to add after the cut (may be empty)
        :return: the modified word
        """
        # Slice by explicit length: word[:-0] would wrongly yield ""
        stem = word[0:len(word) - remove_total]
        return stem + append_string if append_string else stem

    def __repr__(self):
        return '<LancasterStemmer>'

# Script entry point: run the doctests embedded in this module's docstrings,
# ignoring differences in whitespace between expected and actual output.
if __name__ == "__main__":
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)

Coverage for nltk.stem.lancaster : 94%

82 statements 77 run 5 missing 0 excluded