#! /usr/bin/env sh

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Sanity checks for the regenerated en_US-mozilla dictionary, run after
# make-new-dict.sh and before install-new-dict.sh. Written for POSIX sh
# so it runs unchanged on macOS (BSD userland) and Linux.

# Abort on the first unhandled command failure.
set -e

# Every input and output path is resolved against the directory the script
# is invoked from.
WKDIR=$(pwd)
DICT=$WKDIR/en_US-mozilla.dic
AFF=$WKDIR/en_US-mozilla.aff
BASELINE_DIC=$WKDIR/utf8/en-US-utf8.dic
BASELINE_AFF=$WKDIR/utf8/en-US-utf8.aff
MOZ_SPECIFIC=$WKDIR/mozilla-specific.txt
SCOWL_DIR=$WKDIR/scowl
MUNCH_LIST=$SCOWL_DIR/speller/munch-list
MOZ_REMOVED=$WKDIR/5-mozilla-removed.txt
WORDLIST_DIFF_URL_BASE='https://raw.githubusercontent.com/en-wl/wordlist-diff'

# Preconditions: the regenerated dictionary pair and the baseline pair must
# both be present before any check can run.
if ! { [ -f "$DICT" ] && [ -f "$AFF" ]; }; then
  echo "ERROR: $DICT or $AFF not found. Run make-new-dict.sh first."
  exit 1
fi
if ! { [ -f "$BASELINE_DIC" ] && [ -f "$BASELINE_AFF" ]; }; then
  echo "ERROR: baseline files missing under $WKDIR/utf8/."
  exit 1
fi

# Running totals, incremented by fail() and warn() and reported in the
# summary at the end of the script.
errors=0
warnings=0

# Record a failed check.
# Globals:   errors (incremented by one)
# Arguments: $1 - description of the failure
# Outputs:   "FAIL: <description>" on stdout
fail() {
  errors=$((errors + 1))
  printf '%s\n' "FAIL: $1"
}

# Record a non-fatal problem.
# Globals:   warnings (incremented by one)
# Arguments: $1 - description of the problem
# Outputs:   "WARN: <description>" on stdout
warn() {
  warnings=$((warnings + 1))
  printf '%s\n' "WARN: $1"
}

# Report a passed check; no counter is touched.
# Arguments: $1 - description of what passed
# Outputs:   "OK:   <description>" on stdout (padded to line up with FAIL:/WARN:)
ok() {
  printf '%s\n' "OK:   $1"
}

printf '\n=== 1. ISO-8859-1 round-trip ===\n'
# Only iconv's exit status is of interest: the converted text and any
# diagnostics are both discarded.
if ! iconv -f utf-8 -t iso-8859-1 < "$DICT" > /dev/null 2>&1; then
  fail "Dictionary contains characters outside ISO-8859-1 (install-new-dict.sh would mangle them)"
else
  ok "Dictionary fits in ISO-8859-1"
fi

printf '\n=== 2. Mozilla-specific words preserved ===\n'
# Each entry of mozilla-specific.txt (blank lines and "#" comment lines are
# skipped; anything from the first "/" onwards is ignored) must still be
# present in the regenerated .dic, on a line that is either the bare word or
# the word followed by "/<something>".
if [ ! -f "$MOZ_SPECIFIC" ]; then
  fail "Mozilla-specific word list not found: $MOZ_SPECIFIC"
else
  # The "|| [ -n ... ]" keeps a final entry that lacks a trailing newline.
  while IFS= read -r line || [ -n "$line" ]; do
    case "$line" in
      ''|'#'*) continue ;;
    esac
    word=${line%%/*}
    # Compare literally: cut yields the text before the first "/" of every
    # .dic line, and grep -F -x requires a fixed-string whole-line match, so
    # words containing regex metacharacters (".", "+", "(", ...) are neither
    # mis-matched nor able to break the pattern. LC_ALL=C makes the
    # comparison byte-wise.
    if LC_ALL=C cut -d/ -f1 "$DICT" | LC_ALL=C grep -Fxq -e "$word"; then
      ok "$word"
    else
      fail "Missing Mozilla-specific word: $word"
    fi
  done < "$MOZ_SPECIFIC"
fi

printf '\n=== 3. Suggestion exclusions preserved ===\n'
# Lines ending in "!" are what this check treats as suggestion exclusions.
# Every such line of the baseline .dic must also be present, verbatim, in
# the regenerated .dic; lines that are new in the regenerated .dic are only
# reported for information.
TMPD="${TMPDIR:-/tmp}"
# Install the cleanup trap before creating anything so that a failure of the
# second mktemp cannot leak the first file.
old_nosug=
new_nosug=
trap 'rm -f "$old_nosug" "$new_nosug"' EXIT
# mktemp creates the files with unpredictable names instead of PID-derived
# names inside a shared temporary directory.
old_nosug=$(mktemp "$TMPD/old-nosug.XXXXXX")
new_nosug=$(mktemp "$TMPD/new-nosug.XXXXXX")
grep '!$' "$BASELINE_DIC" | LC_ALL=C sort > "$old_nosug"
grep '!$' "$DICT"         | LC_ALL=C sort > "$new_nosug"
# comm has to use the same collation as the sort above. Otherwise an
# implementation that verifies its input order can reject the C-sorted
# files, and the non-zero status of the command substitution would abort
# the whole script under "set -e".
missing=$(LC_ALL=C comm -23 "$old_nosug" "$new_nosug")
added=$(LC_ALL=C comm -13 "$old_nosug" "$new_nosug")
if [ -z "$missing" ]; then
  ok "All previous suggestion exclusions preserved"
else
  fail "Missing suggestion exclusions:"
  printf '%s\n' "$missing" | sed 's/^/  /'
fi
if [ -n "$added" ]; then
  printf 'INFO: New suggestion exclusions:\n'
  printf '%s\n' "$added" | sed 's/^/  /'
fi

printf '\n=== 4. Diff stats ===\n'
# Compare the line counts of the baseline and the regenerated .dic. tr
# removes any padding spaces from wc's output so the values are plain
# integers.
old_lines=$(wc -l < "$BASELINE_DIC" | tr -d ' ')
new_lines=$(wc -l < "$DICT" | tr -d ' ')
delta=$((new_lines - old_lines))
# Absolute value: strip a leading minus sign if there is one.
abs=${delta#-}
case $delta in
  -*) delta_str=$delta ;;
  *)  delta_str="+$delta" ;;
esac
# The displayed percentage is truncated towards zero (integer arithmetic).
if [ "$old_lines" -gt 0 ]; then
  pct=$((abs * 100 / old_lines))
else
  pct=0
fi
printf 'Baseline lines: %s\n' "$old_lines"
printf 'New lines:      %s (delta %s, %s%%)\n' "$new_lines" "$delta_str" "$pct"
# Decide on the warning with cross-multiplied integers rather than the
# truncated percentage: a 25.9% change must warn too, and so must any
# change against an empty baseline (where no percentage can be computed).
warn_pct=25
if [ "$((abs * 100))" -gt "$((old_lines * warn_pct))" ]; then
  warn "Line count changed by more than 25% of the baseline; double-check the output"
fi

printf '\n=== 5. Upstream en_US.txt subset check ===\n'
# The Mozilla dictionary should equal upstream en_US.txt minus Mozilla
# removals, plus Mozilla additions, variants and accented words. So every
# word in upstream en_US.txt that isn't in 5-mozilla-removed.txt should be
# present in the regenerated wordlist obtained by expanding en_US-mozilla.dic
# through its affix file.
#
# The reference en_US.txt lives in the wordlist-diff mirror, which carries
# the same release tags as SCOWL itself. We only need that one file, so
# fetch it directly from raw.githubusercontent.com instead of requiring a
# clone of the repo.
#
# Each unmet prerequisite below downgrades the check to a warning and skips
# it; only words that are actually missing count as an error.
scowl_version=$(git -C "$SCOWL_DIR" describe --tags --exact-match 2>/dev/null || true)
if [ -z "$scowl_version" ]; then
  warn "$SCOWL_DIR is not on a tagged release; skipping upstream subset check."
elif ! command -v curl >/dev/null 2>&1; then
  warn "curl not available; skipping upstream subset check."
elif [ ! -x "$MUNCH_LIST" ]; then
  warn "$MUNCH_LIST not available; skipping upstream subset check."
elif [ ! -f "$MOZ_REMOVED" ]; then
  warn "$MOZ_REMOVED not found; run make-new-dict.sh first."
elif ! subset_tmp=$(mktemp -d "$TMPD/dict-subset.XXXXXX"); then
  warn "Could not create a temporary directory under $TMPD; skipping upstream subset check."
else
  # All intermediate files live in a private mktemp directory instead of
  # under predictable PID-derived names in the shared temporary directory.
  upstream_raw="$subset_tmp/wordlist-diff-en_US.txt"
  upstream_sorted="$subset_tmp/upstream"
  final_wordlist="$subset_tmp/final-wordlist"
  removed_sorted="$subset_tmp/removed"
  expected_subset="$subset_tmp/expected"
  unexpected_missing="$subset_tmp/unexpected"
  # This replaces the EXIT trap installed by check 3, so its two files have
  # to be cleaned up here as well.
  trap 'rm -f "$old_nosug" "$new_nosug"; rm -rf "$subset_tmp"' EXIT

  url="$WORDLIST_DIFF_URL_BASE/$scowl_version/en_US.txt"
  printf 'Fetching %s ...\n' "$url"
  if ! curl -fsSL "$url" -o "$upstream_raw"; then
    warn "Could not download $url; skipping upstream subset check."
  else
    # Expand the regenerated dictionary through its affix file to get the
    # full wordlist. The .dic is UTF-8 at this point; munch-list operates
    # on ISO-8859-1, so pipe it through iconv. Strip the count line at
    # the top (only digits). "[0-9][0-9]*" is used instead of "[0-9]\+"
    # because "\+" is not defined for POSIX basic regular expressions.
    iconv -f utf-8 -t iso-8859-1 "$DICT" \
      | grep -v '^[0-9][0-9]*$' \
      | LC_ALL=C "$MUNCH_LIST" expand "$AFF" \
      | LC_ALL=C sort -u > "$final_wordlist"

    # Normalize the upstream baseline and Mozilla removals to ISO-8859-1
    # to match the regenerated wordlist. Characters that can't be
    # represented are transliterated (//TRANSLIT) and iconv's diagnostics
    # are discarded; such characters can't be in the shipped .dic either,
    # and check 1 catches that case separately.
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$upstream_raw" 2>/dev/null | LC_ALL=C sort -u > "$upstream_sorted"
    iconv -f utf-8 -t iso-8859-1//TRANSLIT "$MOZ_REMOVED" 2>/dev/null | LC_ALL=C sort -u > "$removed_sorted"

    # Drop words Mozilla intentionally removed from the upstream baseline.
    LC_ALL=C comm -23 "$upstream_sorted" "$removed_sorted" > "$expected_subset"
    # Anything still missing from the regenerated wordlist is unexpected.
    LC_ALL=C comm -23 "$expected_subset" "$final_wordlist" > "$unexpected_missing"

    if [ ! -s "$unexpected_missing" ]; then
      ok "Upstream en_US.txt at $scowl_version is a subset of the regenerated wordlist (minus Mozilla removals)"
    else
      total=$(wc -l < "$unexpected_missing" | tr -d ' ')
      fail "$total upstream words from $scowl_version missing from the regenerated wordlist and not in 5-mozilla-removed.txt:"
      # Show at most the first 20 offenders to keep the report readable.
      head -n 20 "$unexpected_missing" | sed 's/^/  /'
      if [ "$total" -gt 20 ]; then
        printf '  ... and %s more\n' "$((total - 20))"
      fi
    fi
  fi
fi

printf '\n=== Summary ===\n'
printf 'Errors: %d  Warnings: %d\n' "$errors" "$warnings"
# The exit status reflects errors only; warnings alone never fail the run.
status=0
if [ "$errors" -gt 0 ]; then
  status=1
fi
exit "$status"