/*
 * -----------------------------------------------------------------------------
 * Persian Stemmer (HPS-like)
 * -----------------------------------------------------------------------------
 * Based on the paper:
 *   "HPS: A Hierarchical Persian Stemming Method"
 *   Ayshe Rashidi, Mina Zolfy Lighvan (2014)
 * -----------------------------------------------------------------------------
 * Differences from original HPS:
 * 1) No POS-tagger stage:
 *      - HPS uses a POS tagger to route tokens to Noun, Adjective, or Verb
 *        suffix rules directly.
 *      - Here we fall back to the fixed order: Noun or Adjective, then Verb
 *        when POS is unknown.
 *      Reason: Implementation is tagger-agnostic and lightweight.
 * 2) Lexical "-AN" guard:
 *      - Added `Protect_Lexical_AN` to avoid stripping true lexical endings
 *        like "...stan", "...ran", and known stems (Iran, Tehran, Ensan).
 *      Reason: reduces over-stemming where HPS relied on explicit hash lists.
 * -----------------------------------------------------------------------------
 * Implemented by: https://saeiddrv.com
 * -----------------------------------------------------------------------------
 */

// Use '{' and '}' to delimit escapes (named stringdefs and {U+xxxx} code
// points) inside string literals.
stringescapes { }

// ============================================================================
//  Alphabet & special symbols
// ============================================================================
// Persian letters, named by their transliterated Persian names.  A few names
// worth spelling out:
//   aa       = alef with madda above (U+0622)
//   heh_jimi = the letter at U+062D, as opposed to heh (U+0647)
//   kaf      = Persian kaf / keheh (U+06A9); the Arabic kaf is `ar_kaf`
//   ye       = Persian (Farsi) yeh (U+06CC); the Arabic yeh is `ar_ye`
stringdef alef       '{U+0627}'
stringdef aa         '{U+0622}'
stringdef be         '{U+0628}'
stringdef pe         '{U+067E}'
stringdef te         '{U+062A}'
stringdef se         '{U+062B}'
stringdef jim        '{U+062C}'
stringdef che        '{U+0686}'
stringdef heh_jimi   '{U+062D}'
stringdef khe        '{U+062E}'
stringdef dal        '{U+062F}'
stringdef zal        '{U+0630}'
stringdef re         '{U+0631}'
stringdef ze         '{U+0632}'
stringdef zhe        '{U+0698}'
stringdef sin        '{U+0633}'
stringdef shin       '{U+0634}'
stringdef sad        '{U+0635}'
stringdef zad        '{U+0636}'
stringdef ta         '{U+0637}'
stringdef za         '{U+0638}'
stringdef ain        '{U+0639}'
stringdef ghain      '{U+063A}'
stringdef fe         '{U+0641}'
stringdef ghaf       '{U+0642}'
stringdef kaf        '{U+06A9}'
stringdef gaf        '{U+06AF}'
stringdef lam        '{U+0644}'
stringdef mim        '{U+0645}'
stringdef nun        '{U+0646}'
stringdef vav        '{U+0648}'
stringdef heh        '{U+0647}'
stringdef ye         '{U+06CC}'

// Spacing and joining characters.
stringdef space             '{U+0020}'
stringdef zero_width_joiner '{U+200D}'
stringdef half_space        '{U+200C}' // zero-width non-joiner (ZWNJ)
// Note: U+200C (ZERO WIDTH NON-JOINER) is commonly referred to as "nim-faseleh" ("half-space") in Persian.
// Although this name is not standard in English Unicode terminology, it reflects its widespread use
// in Persian as a morpheme separator within words. It is declared here because `Prefixes` matches it
// as the separator between a prefix and the stem, and `Delete_ZWNJ` then removes any that remain.

// Arabic variants to normalize to Persian forms
// (each is rewritten to a Persian letter by `Normalize_Characters`).
stringdef ar_kaf                   '{U+0643}'
stringdef ar_ye                    '{U+064A}'
stringdef ar_ye_with_hamza_above   '{U+0626}'
stringdef ar_heh                   '{U+06C1}' // HEH GOAL
stringdef ar_teh_marbuta           '{U+0629}'
stringdef ar_alef_with_hamza_above '{U+0623}'
stringdef ar_alef_with_hamza_below '{U+0625}'
stringdef ar_vav_with_hamza_above  '{U+0624}'

// ============================================================================
//  Declarations (routines, flags, and marks)
// ============================================================================
routines (
  Normalize_Characters  // Forward: Unicode/script normalization
  Prefixes              // Forward: handles prefixes
  Delete_ZWNJ           // Forward: delete remaining ZWNJs after prefix handling
  Protect_Lexical_AN    // Probe: flag lexical -AN endings (per pass)

  R1 // Test if cursor is in R1 (i.e. at or after mark p1)

  // Backward-mode suffix routines
  AN_Exception            // Whole-word list consulted by Protect_Lexical_AN
  Irregular_Noun          // Irregular noun rewrites
  Stem_Noun_or_Adjective  // Noun/adjective suffix removal
  Stem_Verb               // Verb suffix removal
)

// `stem` is the only entry point exported to callers.
externals ( stem )

// Guard flags:
//  - 'saw_present_prefix':
//    set by `Prefixes` when a present-tense prefix followed by ZWNJ is matched
//    (mi- is stripped, nemi- is detected but kept); `stem` copies it into
//    `remove_verb_person_endings` at the start of every suffix removal pass.
//  - 'remove_verb_person_endings':
//    gates the generic person-ending rules in `Stem_Verb`.  `stem` resets it
//    to the value of `saw_present_prefix` at the start of each pass.
booleans ( saw_present_prefix remove_verb_person_endings )

// p1: start of region R1.  `stem` sets it 3 characters into the token, or to
// the end of the token when the token is shorter than that.
integers ( p1 )

// ============================================================================
//  PHASE 1 — NORMALIZATION (forward)
//  - Unify Arabic forms to Persian letters for stable downstream matching.
//  - Delete ZWJ (U+200D) and ASCII spaces so the token is solid.
//  - ZWNJ (U+200C) is preserved here; used as prefix-stem separator in Phase 2,
//    then deleted by Delete_ZWNJ after prefix stripping is complete.
// ============================================================================
define Normalize_Characters as (
  // Scan the token from left to right.  At every position the longest listed
  // string (possibly the empty one) is selected and its action runs.  The
  // loop stops once `next` fails at the end of the token.
  repeat (
    [substring] among (
      // Characters that are dropped outright.
      '{space}'
      '{zero_width_joiner}'
          ( delete )

      // Arabic-script variants folded into the Persian letter used by the
      // suffix and prefix tables.
      '{ar_alef_with_hamza_above}'
      '{ar_alef_with_hamza_below}'
          ( <- '{alef}' )
      '{ar_heh}'
      '{ar_teh_marbuta}'
          ( <- '{heh}' )
      '{ar_kaf}'
          ( <- '{kaf}' )
      '{ar_vav_with_hamza_above}'
          ( <- '{vav}' )
      '{ar_ye}'
      '{ar_ye_with_hamza_above}'
          ( <- '{ye}' )

      // Anything else (ZWNJ included) is left alone: step over one character.
      ''
          ( next )
    )
  )
)

// ============================================================================
//  PHASE 2 — PREFIX HANDLING (forward, anchored)
//  - HPS: handle known prefixes before suffix processing.
//  - Requires ZWNJ (U+200C) between prefix and stem to avoid mishandling
//    words that begin with the same syllables but are not prefixed forms.
//  - At least two characters are required after the prefix.
//  - Sets `saw_present_prefix` when matching {mi}/{nemi} to enable person endings later.
//  - Note: negating derivational prefixes na- and bi- are intentionally not
//    stripped as they create words with opposite meanings (e.g. na+dorost =
//    incorrect, bi+khatr = safe) and conflating them with their base forms
//    would harm search precision.
//  - Similarly, nemi- negates the meaning so we don't remove it but do still
//    set `saw_present_prefix` when it is present.
// ============================================================================
define Prefixes as (
  // Anchored at the cursor: the token must start with one of the prefixes
  // below, ZWNJ included.  `hop 2` fails unless at least two characters follow
  // the prefix, in which case the whole routine fails and nothing changes.
  [substring] among (
    '{mim}{ye}{half_space}'       // MI + ZWNJ: strip prefix and separator
        ( hop 2  delete )
    '{nun}{mim}{ye}{half_space}'  // NEMI + ZWNJ: detect but keep
        ( hop 2 )
  )

  // Reached only when one of the arms above succeeded.
  set saw_present_prefix
)

// ============================================================================
//  PHASE 2b — DELETE REMAINING ZWNJs (forward)
//  - After prefix detection, any remaining ZWNJs (e.g. in compound words)
//    are no longer needed and are removed here.
// ============================================================================
define Delete_ZWNJ as (
  // Same scanning idiom as Normalize_Characters: at each position either
  // delete a ZWNJ found there or step over one character.  The loop stops
  // when `next` fails at the end of the token.
  repeat (
    [substring] among (
      '{half_space}'  ( delete )
      ''              ( next )
    )
  )
)

backwardmode (

  // R1 test: signals t when the cursor is at or after mark p1.  In backward
  // mode the cursor sits at the left edge of a just-matched suffix, so this
  // succeeds only when the whole suffix lies inside R1.  It never moves the
  // cursor or edits the buffer.
  define R1 as $p1 <= cursor

// ============================================================================
//  PROBE — PROTECT LEXICAL "-AN" (not plural)
//  - HPS strips -AN as a plural; however, many stems end in orthographic "AN":
//      ...stan / ...san / ...ran / ...van (place names/lexical stems such as
//      Iran/Tehran/Ensan/Ostan), plus the whole-word stems listed in
//      `AN_Exception`.
//  - The routine signals f (fails) if the token ends in such a lexical AN and
//    signals t otherwise.
//  - This is a probe only: it NEVER edits the buffer; it only signals.
// ============================================================================
  define Protect_Lexical_AN as (
    // Signal f as soon as either lexical test matches at the end of the
    // token; signal t when neither does.  `not` restores the cursor on
    // success, so the probe consumes nothing and never edits the buffer.
    not (
      AN_Exception or
      among (
        '{re}{alef}{nun}'       // ...ran
        '{sin}{alef}{nun}'      // ...san
        '{sin}{te}{alef}{nun}'  // ...stan
        '{vav}{alef}{nun}'      // ...van: onvaan, divaan, javaan, ravaan
      )
    )
  )

// ============================================================================
//  PHASE 3 — SUFFIX STRIPPING (backward)
//  - All routines in backward mode start matching from the end of the token.
//  - IMPORTANT: no successful "no-op" arms; every success must modify the text.
//    This ensures 'repeat' loops terminate.
//  - Most suffixes are only removed/replaced if they are entirely in R1 to
//    avoid over-stemming short words.
// ============================================================================

  // --- Lexical "-AN" whole-word list.
  //     Small curated set of words ending in orthographic -AN that are not
  //     plurals.  The match has to reach the beginning of the token
  //     (`atlimit`), so the entries are whole words rather than suffixes.
  //     Used by `Protect_Lexical_AN` to prevent stripping on place names /
  //     lexical nouns.
  define AN_Exception as (
    among (
      // Four letters
      '{be}{ye}{alef}{nun}'                           // bayan
      '{re}{mim}{alef}{nun}'                          // roman
      '{nun}{shin}{alef}{nun}'                        // neshan

      // Five letters
      '{aa}{lam}{mim}{alef}{nun}'                     // alman
      '{aa}{sin}{mim}{alef}{nun}'                     // asman
      '{alef}{ye}{mim}{alef}{nun}'                    // eiman
      '{alef}{ye}{shin}{alef}{nun}'                   // ishan
      '{alef}{mim}{kaf}{alef}{nun}'                   // emkan
      '{pe}{alef}{ye}{alef}{nun}'                     // payan
      '{pe}{ye}{mim}{alef}{nun}'                      // payman
      '{jim}{re}{ye}{alef}{nun}'                      // jaryan
      '{dal}{re}{mim}{alef}{nun}'                     // darman
      '{ze}{nun}{dal}{alef}{nun}'                     // zendan
      '{sin}{lam}{ta}{alef}{nun}'                     // soltan
      '{gaf}{ye}{lam}{alef}{nun}'                     // Gilan
      '{kaf}{re}{mim}{alef}{nun}'                     // Kerman
      '{lam}{be}{nun}{alef}{nun}'                     // lobnan
      '{mim}{ye}{ze}{alef}{nun}'                      // mizan
      '{heh}{mim}{dal}{alef}{nun}'                    // hamedan
      '{ye}{vav}{nun}{alef}{nun}'                     // yunan

      // Six letters
      '{alef}{sad}{fe}{heh}{alef}{nun}'               // esfahan
      '{sin}{alef}{ze}{mim}{alef}{nun}'               // sazman
      '{ghaf}{heh}{re}{mim}{alef}{nun}'               // ghahramaan
      '{khe}{alef}{nun}{dal}{alef}{nun}'              // khandan
      '{mim}{sin}{lam}{mim}{alef}{nun}'               // mosalman
      '{kaf}{heh}{kaf}{shin}{alef}{nun}'              // kahkeshan (galaxy)
      '{pe}{re}{ye}{shin}{alef}{nun}'                 // perishan (confused)
      '{dal}{re}{khe}{shin}{alef}{nun}'               // darakhshan (shining)
      '{heh}{mim}{ze}{mim}{alef}{nun}'                // hamzaman (simultaneous)
      '{sin}{lam}{ye}{mim}{alef}{nun}'                // soleyman (Solomon)

      // Seven letters and more
      '{aa}{te}{shin}{fe}{shin}{alef}{nun}'           // atashfeshan (volcano)
      '{sin}{alef}{khe}{te}{mim}{alef}{nun}'          // sakhteman (building)
      '{aa}{zal}{re}{be}{alef}{ye}{jim}{alef}{nun}'   // azerbaijan
    )

    // The longest matching entry must cover the entire token.
    atlimit
  )

  // --- Noun irregular rewrites.
  //     Each alternative matches a fixed ending at the end of the token and
  //     replaces it with a shorter form (e.g. the plural akhbar -> khabar).
  //     There is no R1 or whole-word guard.  Only include entries that
  //     actually change the buffer.
  define Irregular_Noun as (
    ( ['{alef}{sin}{alef}{te}{ye}{dal}'] <- '{alef}{sin}{te}{alef}{dal}' ) or
    ( ['{alef}{khe}{be}{alef}{re}']      <- '{khe}{be}{re}' )
  )

  // --- Noun/Adjective step.
  //     First tries the irregular rewrites; otherwise removes the longest
  //     listed suffix that lies entirely in R1 (matching is limited to the
  //     part of the token at or after p1).
  define Stem_Noun_or_Adjective as (
    Irregular_Noun or
    setlimit tomark p1 for (
      [substring] among (
        // Comparative -TAR is the one guarded entry: it is removed only when
        // at least one character of R1 precedes it.
        '{te}{re}'
            ( not atlimit delete )

        // Every other suffix below is simply deleted.

        // Noun possessive suffixes
        '{alef}{mim}'        // -AM
        '{alef}{shin}'       // -ASH

        // -MAN/-TAN/-SHAN are deliberately left out.  These 3-character
        // possessives collide with the plural -AN on nouns whose root ends in
        // mim, te or shin:
        //   ketab+maan (our book)   -> strip -maan -> ketab   (wanted)
        //   mardom+an  (people pl.) -> strip -an   -> mardom  (NOT -> mard)
        //   naqqash+an (painters)   -> strip -an   -> naqqash (NOT -> naqqaa)
        //   derakht+an (trees)      -> strip -an   -> derakht (NOT -> derakh)
        // Without a lexicon the two cases cannot be told apart, so only the
        // -AN rule is kept, at the cost of missing possessive stripping:
        //        '{mim}{alef}{nun}'   // -MAN
        //        '{te}{alef}{nun}'    // -TAN
        //        '{shin}{alef}{nun}'  // -SHAN

        // Noun plurals (-YAN and -GAN are productive)
        '{ye}{alef}{nun}'    // -YAN
        '{gaf}{alef}{nun}'   // -GAN
        '{heh}{alef}{ye}'    // -HAY
        '{alef}{nun}{ye}'    // -ANI
        '{heh}{alef}'        // -HA
        '{alef}{te}'         // -AT (Arabic sound plural; can also be possessive "your")
        '{alef}{nun}'        // -AN (only if not lexical)
        '{ye}{nun}'          // -IN (also adjective derivation ending)

        // Other derivational noun endings (conservative set)
        '{gaf}{alef}{heh}'   // -GAH
        '{be}{alef}{nun}'    // -BAN
        '{gaf}{ye}'          // -GI (abstract noun)
        '{ye}{te}'           // -YAT
        '{ye}{ye}'           // -YY (double Y; also adj. relative -Y)

        // Adjective superlative (HPS)
        '{te}{re}{ye}{nun}'  // -TARIN

        // Adjective derivational endings (HPS list + common variants)
        '{alef}{nun}{heh}'   // -ANE
        '{mim}{nun}{dal}'    // -MAND
        '{vav}{alef}{re}'    // -VAR
        '{nun}{alef}{kaf}'   // -NAK
        '{gaf}{alef}{re}'    // -GAR
            ( delete )
      )
    )
  )

  // --- Verb step.
  //     Two suffix tables are tried in order at the end of the token; the
  //     second is consulted only if the first fails (no match, or the matched
  //     suffix is not entirely in R1).  Every successful arm shortens the
  //     buffer.
  define Stem_Verb as (
    (
      // Participle + person/aux clitic tails
      // Removed whenever the longest match lies entirely in R1; no verb cue
      // is required.
      [substring] among (
        '{alef}{ye}{dal}'
        '{alef}{ye}{mim}'
        '{alef}{nun}{dal}'
        '{alef}{sin}{te}'
        '{alef}{sin}'
        '{alef}{ye}'
        '{ye}{dal}'
        '{ye}{mim}'
            (R1 delete)
      )
    ) or (
      [substring] among (
        // Generic person and singular/plural endings — only when a verb cue is active
        // (the flag test comes first, so nothing is removed when it is unset).
        '{alef}{nun}{dal}'  // -AND
        '{ye}{dal}'         // -ID
        '{ye}{mim}'         // -IM
        '{alef}{mim}'       // -AM
        '{dal}'             // -D
        '{mim}'             // -M
            ( remove_verb_person_endings R1 delete )

        // Specific past-3sg stem fixes
        // Inflected forms are rewritten to the bare three-letter stem.  These
        // arms have no R1 or flag guard.
        '{re}{fe}{te}{mim}'
        '{re}{fe}{te}{ye}'
        '{re}{fe}{te}{ye}{mim}'
        '{re}{fe}{te}{ye}{dal}'
        '{re}{fe}{te}{alef}{nun}{dal}'
            (<- '{re}{fe}{te}')

        // Mood/tense markers (HPS).
        // Note: `stem` resets remove_verb_person_endings at the start of every
        // pass, so the `set` in the actions below is overwritten before any
        // later pass can test it.
        // NOTE(review): when called after Stem_Noun_or_Adjective (as `stem`
        // does), a trailing -AN that lies in R1 has already been removed by
        // that routine, so the '{alef}{nun}' arm here looks unreachable —
        // confirm before relying on it.
        '{nun}{dal}{heh}'   // agent noun -NDEH → present root
        '{alef}{nun}'       // infinitive -N / -AN
            (R1 delete set remove_verb_person_endings)
        // The two participle arms drop the final heh, but only when at least
        // one character precedes the matched suffix (`not atlimit`).
        '{dal}{heh}'        // past part. -deh
            (not atlimit <-'{dal}' set remove_verb_person_endings)
        '{te}{heh}'         // past part. -teh
            (not atlimit <-'{te}'  set remove_verb_person_endings)
      )
    )
  )
)

// ============================================================================
//  MAIN (HPS pipeline)
// ============================================================================
define stem as (
  // Start every token with a clean prefix flag.
  unset saw_present_prefix

  // Forward phases.  Each runs under `do`, so its failure is ignored and the
  // cursor returns to the start of the token afterwards.
  //   1)  script/spacing normalization (ZWNJ is kept for prefix detection)
  //   2)  leading prefixes (a ZWNJ separator is required)
  //   2b) removal of whatever ZWNJs are still present
  do Normalize_Characters
  do Prefixes
  do Delete_ZWNJ

  // Place p1 three characters into the token; if the token is shorter than
  // that, p1 is the end of the token, which leaves R1 empty.
  do ( (hop 3 or tolimit) setmark p1 )

  // 3) Suffix removal, working from the end of the token.  One suffix is
  // removed per pass; noun/adjective endings are tried before verb endings,
  // so after removing a noun or adjective ending the next pass looks for
  // another noun or adjective ending first.
  //
  // Termination: the loop exits on the first pass that removes nothing, and
  // every removal makes the buffer shorter.
  backwards repeat test (
    // remove_verb_person_endings starts each pass equal to saw_present_prefix.
    (saw_present_prefix set remove_verb_person_endings) or
        unset remove_verb_person_endings

    // A lexical -AN ending stops suffix removal altogether.
    Protect_Lexical_AN

    // HPS as described in the paper routes tokens with a POS tagger.  That
    // information is not available here, so a fixed order is used instead.
    Stem_Noun_or_Adjective or Stem_Verb
  )
)