// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf integers ( // The paper defines measure as the number of vowels in the word. We // count this initially, then adjust the count each time we remove a // prefix or suffix. measure // Numeric code for the type of prefix removed: // // 0 other/none // 1 'di' or 'meng' or 'ter' // 2 'per' // 3 'ke' or 'peng' // 4 'ber' // // Some of these have variant forms, so e.g. "meng" includes "men", "me", // "meny", "mem". // // Note that the value of prefix is only used in remove_suffix (and // routines it calls) so we don't need to worry about // remove_second_order_prefix overwriting a value of prefix set by // remove_first_order_prefix since remove_suffix gets called between // the two. prefix ) groupings ( vowel ) routines ( remove_particle remove_possessive_pronoun remove_first_order_prefix remove_second_order_prefix remove_suffix ) externals ( stem ) stringescapes {} backwardmode ( define remove_particle as ( [substring] among ( 'kah' 'lah' 'pun' (delete $measure-=1) ) ) define remove_possessive_pronoun as ( [substring] among ( 'ku' 'mu' 'nya' (delete $measure-=1) ) ) define remove_suffix as ( [substring] among ( 'an' ( ( // prefix not in {ke, peng, per}. [See SUFFIX_KAN_NOTE] ($prefix != 3 and $prefix != 2) // Remove suffix 'kan' 'k'] ) or ( // prefix not in {di, meng, ter} $prefix != 1 ) ) 'i' ( // prefix not in {ke, peng, ber} $prefix <= 2 // word does not end '-si'. [See SUFFIX_I_NOTE] not 's' ) ) delete $measure-=1 ) // SUFFIX_KAN_NOTE: // On page 29, the example "kompas Q.31" says "Both Nazief and Porter // stemmer converted the word peledakan (blast, explotion [sic]) to // ledak (to blast, to explode)". However, the algorithm as described // doesn't behave in this way - grammatically the prefix pe- occurs as a // variation of both the first-order derivational prefix peng- and the // second-order derivational prefix per-, but table 2.5 doesn't include // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) // as having prefix "per" not "peng", and so we remove derivational // suffix "kan" rather than "an" to give stem leda. (Porter-style // stemmers remove the longest suffix they can amongst those available, // which this paper notes in the last paragraph on page 15). // // We resolve this by amending the condition on suffix "kan" to // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's // behaviour match all the examples in the paper except for one: // "perbaikan" is shown in table 3.4 as stemming to "bai", but with // this change it now stems to "baik". The table notes that "baik" is // the actual root so this deviation is an improvement. In a sample // vocabulary derived from the most common words in id.wikipedia.org, // this change only affects 0.12% of words (76 out of 64,587, including // "peledakan" and "perbaikan"). // SUFFIX_I_NOTE: // The rest of the condition from the paper is: // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i // // The meaning of this is unclear in several ways, and none of the // examples given of the stemmer's behaviour in the paper help to // resolve these issues. // // Notice that c₂ isn't actually used - the most obvious explanation // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". // // Elsewhere the paper defines V... as meaning "the stem starts with // a vowel" and K... as meaning "the stem starts with a consonant". // The meaning of | isn't actually defined, but clearly means // alternation. // // However nowhere is the precedence of | vs ... defined, and there // isn't a standard precedence we could reasonably assume. In other // places where the paper says X|Y... it seems the | binds more // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit // odd as the first letter must be either a vowel or a consonant, so // that really just means "ends cᵢcⱼ" (and has at least one letter // before cᵢ but we only call remove_suffix if $measure > 2 which // ensures that part). However, nowhere in the paper uses or defines // a notation such as ...X, which may explain this seemingly redundant // way of specifying this. // // The conditions elsewhere on prefix removal (e.g. V...) are clearly // on the stem left after the prefix is removed. None of the other // rules for suffix removal have conditions on the stem, but for // consistency with the prefix rules we might expect that the cᵢcⱼ // test is on what's left *after* removing the "i" suffix. // // Studying Indonesian wordlists and discussion with a native // speaker leads us to conclude that the purpose of this check is to // protect words of foreign origin (e.g. "televisi", "organisasi", // "komunikasi") from stemming, and the common feature of these is // that the word ends "-si", so we conclude that the condition here // should be read as "word does not end -si", and this is what we // have implemented. ) define vowel 'aeiou' define remove_first_order_prefix as ( [substring] among ( 'di' 'meng' 'me' 'ter' (delete $prefix=1 $measure-=1) 'men' ( ('y' test vowel ] <-'s' $prefix=1 $measure-=1) or (delete $prefix=1 $measure-=1) ) 'ke' 'peng' (delete $prefix=3 $measure-=1) 'pen' ( ('y' test vowel ] <-'s' $prefix=3 $measure-=1) or (delete $prefix=3 $measure-=1) ) 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) ) ) define remove_second_order_prefix as ( // The paper has the condition on removal of prefix "bel" and "pel" as // just "ajar" not "ajar..." but it seems that the latter must be what // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". // This change only affects a very small number of words (11 out of // 64,587) and only for the better. [among ( 'pe' ( ( 'r'] $prefix=2 ) or ( 'l'] 'ajar' ) or ( ] $prefix=2 ) ) 'be' ( ( 'r'] ) or ( 'l'] 'ajar') or ( ] non-vowel 'er' ) $prefix=4 ) ) // All prefixes we remove here contain exactly one vowel. $measure-=1 delete ) define stem as ( $measure = 0 do ( repeat ( gopast vowel $measure+=1 ) ) $measure > 2 $prefix = 0 backwards ( do remove_particle $measure > 2 do remove_possessive_pronoun ) $measure > 2 test ( remove_first_order_prefix do ( test ($measure > 2 backwards remove_suffix) $measure > 2 remove_second_order_prefix ) ) or ( do remove_second_order_prefix do ($measure > 2 backwards remove_suffix) ) )