/* Stemmer for Turkish
 * author: Evren (Kapusuz) Çilden
 * email: evren.kapusuz at gmail.com
 *
 * stems nominal verb suffixes
 * stems nominal inflections
 * more than one syllable word check
 * (y,n,s,U) context check
 * vowel harmony check
 * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
 *
 * The stemming algorithm is based on the paper "An Affix Stripping
 * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
 * Eşref Adalı (Proceedings of the IAESTED International Conference
 * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
 * Innsbruck, Austria
 *
 * Turkish is an agglutinative language and has a very rich morphological
 * structure. In Turkish, you can form many different words from a single stem
 * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
 * "You had been the doctor of him". The stem of the word is "doktor" and it
 * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
 * the append order of suffixes can be clearly described as FSMs.
 * The paper referenced above defines some FSMs for right to left
 * morphological analysis. I generated a method for constructing snowball
 * expressions from right to left FSMs for stemming suffixes.
 */

routines (
  append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
  check_vowel_harmony  // tests vowel harmony for suffixes
  is_reserved_word     // tests whether current string is a reserved word ('ad','soyad')
  mark_cAsInA          // nominal verb suffix
  mark_DA              // noun suffix
  mark_DAn             // noun suffix
  mark_DUr             // nominal verb suffix
  mark_ki              // noun suffix
  mark_lAr             // noun suffix, nominal verb suffix
  mark_lArI            // noun suffix
  mark_nA              // noun suffix
  mark_ncA             // noun suffix
  mark_ndA             // noun suffix
  mark_ndAn            // noun suffix
  mark_nU              // noun suffix
  mark_nUn             // noun suffix
  mark_nUz             // nominal verb suffix
  mark_sU              // noun suffix
  mark_sUn             // nominal verb suffix
  mark_sUnUz           // nominal verb suffix
  mark_possessives     // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
  mark_yA              // noun suffix
  mark_ylA             // noun suffix
  mark_yU              // noun suffix
  mark_yUm             // nominal verb suffix
  mark_yUz             // nominal verb suffix
  mark_yDU             // nominal verb suffix
  mark_yken            // nominal verb suffix
  mark_ymUs_           // nominal verb suffix
  mark_ysA             // nominal verb suffix

  mark_suffix_with_optional_y_consonant
  mark_suffix_with_optional_U_vowel
  mark_suffix_with_optional_n_consonant
  mark_suffix_with_optional_s_consonant

  more_than_one_syllable_word

  post_process_last_consonants
  postlude

  remove_proper_noun_suffix

  stem_nominal_verb_suffixes
  stem_noun_suffixes
  stem_suffix_chain_before_ki
)

stringescapes  { }

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef cc  '{U+00E7}'  // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~  '{U+011F}'  // LATIN SMALL LETTER G WITH BREVE
stringdef i   '{U+0131}'  // LATIN SMALL LETTER I WITHOUT DOT
stringdef o"  '{U+00F6}'  // LATIN SMALL LETTER O WITH DIAERESIS
stringdef sc  '{U+015F}'  // LATIN SMALL LETTER S WITH CEDILLA
stringdef u"  '{U+00FC}'  // LATIN SMALL LETTER U WITH DIAERESIS

booleans ( continue_stemming_noun_suffixes )

groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 )

define vowel  'ae{i}io{o"}u{u"}'
define U      '{i}iu{u"}'

// the vowel grouping definitions below are used for checking vowel harmony
define vowel1 'a{i}ou'     // vowels that can end with suffixes containing 'a'
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
define vowel3 'a{i}'       // vowels that can end with suffixes containing '{i}'
define vowel4 'ei'         // vowels that can end with suffixes containing 'i'
define vowel5 'ou'         // vowels that can end with suffixes containing 'o' or 'u'
define vowel6 '{o"}{u"}'   // vowels that can end with suffixes containing '{o"}' or '{u"}'

externals  ( stem )

backwardmode (
  // checks vowel harmony for possible suffixes,
  // helps to detect whether the candidate for suffix applies to vowel harmony
  // this rule is added to prevent over stemming
  define check_vowel_harmony as (
    test
    (
      (goto vowel)   // if there is a vowel
      (
        ('a' goto vowel1) or
        ('e' goto vowel2) or
        ('{i}' goto vowel3) or
        ('i' goto vowel4) or
        ('o' goto vowel5) or
        ('{o"}' goto vowel6) or
        ('u' goto vowel5) or
        ('{u"}' goto vowel6)
      )
    )
  )

  // if the last consonant before suffix is vowel and n then advance and delete
  // if the last consonant before suffix is non vowel and n do nothing
  // if the last consonant before suffix is not n then only delete the suffix
  // assumption: slice beginning is set correctly
  define mark_suffix_with_optional_n_consonant as (
    ('n' (test vowel))
    or
    ((not 'n') test(next vowel))
  )

  // if the last consonant before suffix is vowel and s then advance and delete
  // if the last consonant before suffix is non vowel and s do nothing
  // if the last consonant before suffix is not s then only delete the suffix
  // assumption: slice beginning is set correctly
  define mark_suffix_with_optional_s_consonant as (
    ('s' (test vowel))
    or
    ((not 's') test(next vowel))
  )

  // if the last consonant before suffix is vowel and y then advance and delete
  // if the last consonant before suffix is non vowel and y do nothing
  // if the last consonant before suffix is not y then only delete the suffix
  // assumption: slice beginning is set correctly
  define mark_suffix_with_optional_y_consonant as (
    ('y' (test vowel))
    or
    ((not 'y') test(next vowel))
  )

  define mark_suffix_with_optional_U_vowel as (
    (U (test non-vowel))
    or
    ((not U) test(next non-vowel))
  )

  define mark_possessives as (
    among ('m{i}z' 'miz' 'muz' 'm{u"}z'
           'n{i}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
    (mark_suffix_with_optional_U_vowel)
  )

  define mark_sU as (
    check_vowel_harmony
    U
    (mark_suffix_with_optional_s_consonant)
  )

  define mark_lArI as (
    among ('leri' 'lar{i}')
  )

  define mark_yU as (
    check_vowel_harmony
    U
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_nU as (
    check_vowel_harmony
    among ('n{i}' 'ni' 'nu' 'n{u"}')
  )

  define mark_nUn as (
    check_vowel_harmony
    among ('{i}n' 'in' 'un' '{u"}n')
    (mark_suffix_with_optional_n_consonant)
  )

  define mark_yA as (
    check_vowel_harmony
    among('a' 'e')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_nA as (
    check_vowel_harmony
    among('na' 'ne')
  )

  define mark_DA as (
    check_vowel_harmony
    among('da' 'de' 'ta' 'te')
  )

  define mark_ndA as (
    check_vowel_harmony
    among('nda' 'nde')
  )

  define mark_DAn as (
    check_vowel_harmony
    among('dan' 'den' 'tan' 'ten')
  )

  define mark_ndAn as (
    check_vowel_harmony
    among('ndan' 'nden')
  )

  define mark_ylA as (
    check_vowel_harmony
    among('la' 'le')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_ki as (
    'ki'
  )

  define mark_ncA as (
    check_vowel_harmony
    among('ca' 'ce')
    (mark_suffix_with_optional_n_consonant)
  )

  define mark_yUm as (
    check_vowel_harmony
    among ('{i}m' 'im' 'um' '{u"}m')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_sUn as (
    check_vowel_harmony
    among ('s{i}n' 'sin' 'sun' 's{u"}n' )
  )

  define mark_yUz as (
    check_vowel_harmony
    among ('{i}z' 'iz' 'uz' '{u"}z')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_sUnUz as (
    among ('s{i}n{i}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
  )

  define mark_lAr as (
    check_vowel_harmony
    among ('ler' 'lar')
  )

  define mark_nUz as (
    check_vowel_harmony
    among ('n{i}z' 'niz' 'nuz' 'n{u"}z')
  )

  define mark_DUr as (
    check_vowel_harmony
    among ('t{i}r' 'tir' 'tur' 't{u"}r' 'd{i}r' 'dir' 'dur' 'd{u"}r')
  )

  define mark_cAsInA as (
    among ('cas{i}na' 'cesine')
  )

  define mark_yDU as (
    check_vowel_harmony
    among ('t{i}m' 'tim' 'tum' 't{u"}m' 'd{i}m' 'dim' 'dum' 'd{u"}m'
      't{i}n' 'tin' 'tun' 't{u"}n' 'd{i}n' 'din' 'dun' 'd{u"}n'
      't{i}k' 'tik' 'tuk' 't{u"}k' 'd{i}k' 'dik' 'duk' 'd{u"}k'
      't{i}' 'ti' 'tu' 't{u"}' 'd{i}' 'di' 'du' 'd{u"}')
    (mark_suffix_with_optional_y_consonant)
  )

  // does not fully obey vowel harmony
  define mark_ysA as (
    among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_ymUs_ as (
    check_vowel_harmony
    among ('m{i}{sc}' 'mi{sc}' 'mu{sc}' 'm{u"}{sc}')
    (mark_suffix_with_optional_y_consonant)
  )

  define mark_yken as (
    'ken' (mark_suffix_with_optional_y_consonant)
  )

  define stem_nominal_verb_suffixes as (
    [
      set continue_stemming_noun_suffixes
      (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
      or
      (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
      or
      (
        mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
        unset continue_stemming_noun_suffixes
      )
      or
      (mark_nUz (mark_yDU or mark_ysA))
      or
      ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
      or
      (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
    ]delete
  )

  // stems noun suffix chains ending with -ki
  define stem_suffix_chain_before_ki as (
    [
      mark_ki
      (
        (mark_DA] delete try([
          (mark_lAr] delete try(stem_suffix_chain_before_ki))
          or
          (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))

        ))
        or
        (mark_nUn] delete try([
          (mark_lArI] delete)
          or
          ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
          or
          (stem_suffix_chain_before_ki)
        ))
        or
        (mark_ndA (
          (mark_lArI] delete)
          or
          ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
          or
          (stem_suffix_chain_before_ki)
        ))
      )
  )

  define stem_noun_suffixes as (
    ([mark_lAr] delete try(stem_suffix_chain_before_ki))
    or
    ([mark_ncA] delete
      try(
        ([mark_lArI] delete)
        or
        ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
        or
        ([mark_lAr] delete stem_suffix_chain_before_ki)
      )
    )
    or
    ([(mark_ndA or mark_nA)
      (
        (mark_lArI] delete)
        or
        (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
        or
        (stem_suffix_chain_before_ki)
      )
    )
    or
    ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
    or
    ( [mark_DAn] delete try ([
      (
        (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
        or
        (mark_lAr] delete try(stem_suffix_chain_before_ki))
        or
        (stem_suffix_chain_before_ki)
      ))
    )
    or
    ([mark_nUn or mark_ylA] delete
      try(
        ([mark_lAr] delete stem_suffix_chain_before_ki)
        or
        ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
        or
        stem_suffix_chain_before_ki
      )
    )
    or
    ([mark_lArI] delete)
    or
    (stem_suffix_chain_before_ki)
    or
    ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
    or
    ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
  )

  define post_process_last_consonants as (
    [substring] among (
      'b' (<- 'p')
      'c' (<- '{cc}')
      'd' (<- 't')
      '{g~}' (<- 'k')
    )
  )

  // after stemming if the word ends with 'd' or 'g' most probably last U is
  // overstemmed like in 'kedim' -> 'ked'
  // Turkish words don't usually end with 'd' or 'g'
  // some very well known words are ignored (like 'ad' 'soyad'
  // appends U to stems ending with d or g, decides which vowel to add
  // based on the last vowel in the stem
  define append_U_to_stems_ending_with_d_or_g as (
    [] ('d' or 'g') goto vowel

    (('a' or '{i}') <- '{i}')
    or
    (('e' or 'i') <- 'i')
    or
    (('o' or 'u') <- 'u')
    or
    (('{o"}' or '{u"}') <- '{u"}')
  )

  define is_reserved_word as (
    'ad' try 'soy' atlimit
  )
)

define remove_proper_noun_suffix as (
  // Remove any leading apostrophes (e.g. from tokenisation of single-quoted
  // text).
  do ([goto not '{'}'] delete)

  // https://en.wikipedia.org/wiki/Turkish_language says "In modern
  // Turkish orthography, an apostrophe is used to separate proper names
  // from any suffixes" with the example "Türkiye'dir ("it is Turkey")".
  // Therefore we truncate at the first apostrophe, provided there are at least
  // two characters before it (which avoids adversely affecting some foreign
  // names and words such as "o'connor", "l'entrée").
  do (
    hop 2
    goto '{'}' [ tolimit ] delete
  )
)

// Test if there is more than one syllable.
// In Turkish each vowel indicates a distinct syllable.
define more_than_one_syllable_word as (
  test (loop 2 gopast vowel)
)

define postlude as (
  backwards (
    not is_reserved_word
    do append_U_to_stems_ending_with_d_or_g
    do post_process_last_consonants
  )
)

define stem as (
  do remove_proper_noun_suffix

  more_than_one_syllable_word

  backwards (
    do stem_nominal_verb_suffixes
    continue_stemming_noun_suffixes
    do stem_noun_suffixes
  )

  postlude
)