# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import json
import os
from abc import ABC, abstractmethod
from typing import Any, Callable

import requests


class _Evaluation(ABC):
    """
    The abstract base class for an evaluation that is run by mozperftest.

    See python/mozperftest/mozperftest/metrics/eval.py
    """

    # Include a list of requirements that will be pip installed via the test harness.
    # Requirements should not require a build, and should have the appropriate .whl
    # files for reproducibility.
    requirements: list[str] = []

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        self.log = log
        self.config = config

    @abstractmethod
    def run(self, payloads: list[dict[str, Any]]) -> dict:
        """Run the evaluation and return a perftest metric result."""
        ...


class _LlmJudge(_Evaluation):
    """
    Use the Mozilla LLM Proxy Auth (MLPA) endpoint to run an LLM as a judge.
    """

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        super().__init__(log, config)
        self.endpoint = config.get(
            "endpoint",
            "https://mlpa-prod-prod-mozilla.global.ssl.fastly.net/v1/chat/completions",
        )
        self.model = config.get("model", "vertex_ai/mistral-small-2503")
        self.token = os.environ.get("MOZ_FXA_BEARER_TOKEN")

    def query_llm(self, messages: list[Any]):
        if not self.token:
            raise RuntimeError("Missing MOZ_FXA_BEARER_TOKEN for LLM evaluation.")

        resp = requests.post(
            self.endpoint,
            headers={
                "authorization": f"Bearer {self.token}",
                "content-type": "application/json",
                "service-type": "ai",
            },
            json={
                "model": self.model,
                "messages": messages,
                "stream": False,
            },
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()


class _TranslationsSacreBleu(_Evaluation):
    """
    Compute the bleu or chrF (character level f-score) for a translation.

    https://en.wikipedia.org/wiki/BLEU
    https://en.wikipedia.org/wiki/F-score

    Use TranslationsBleu and TranslationsChrf for the respective scores.
    """

    requirements = [
        "sacrebleu==2.4.2",
    ]

    name = ""

    def compute_score(self, trg: str, ref: str) -> float:
        raise NotImplementedError()

    def run(self, payloads: list[dict[str, Any]]):
        results: list[float] = []
        for payload in payloads:
            if "trg" not in payload or "ref" not in payload:
                raise ValueError(f"Missing required translation fields in {payload}")
            trg = payload["trg"]
            ref = payload["ref"]
            results.append(self.compute_score(trg, ref))

        if not results:
            raise ValueError(
                "No evaluation results were produced for translation data."
            )

        return {
            "name": self.name,
            "values": results,
            # BLEU and chrF are both higher-is-better scores.
            "lowerIsBetter": False,
        }


class TranslationsBleu(_TranslationsSacreBleu):
    """See _TranslationsSacreBleu for documentation."""

    name = "bleu"

    def compute_score(self, trg: str, ref: str) -> float:
        import sacrebleu

        self.log("Computing the bleu score")
        return sacrebleu.corpus_bleu([trg], [[ref]]).score


class TranslationsChrf(_TranslationsSacreBleu):
    """See _TranslationsSacreBleu for documentation."""

    name = "chrF"

    def compute_score(self, trg: str, ref: str) -> float:
        import sacrebleu

        self.log("Computing the chrF score")
        return sacrebleu.corpus_chrf([trg], [[ref]]).score


class TranslationsLlmJudge(_LlmJudge):
    """
    Judge a translation based on an LLM's judgement.

    Returns:
        {
            "score": int,
            "verdict": str,
            "explanation": str,
            "model": str,
        }

    perfherder_metrics: [
        {
            name: "bleu",
            unit: "bleu",
            lowerIsBetter: false,
            shouldAlert: false,
        },
        {
            name: "chrF",
            unit: "chrF",
            lowerIsBetter: false,
            shouldAlert: false,
        },
    ]
    """

    requirements = []

    def run(self, payloads: list[dict[str, Any]]):
        results: list[dict[str, Any]] = []
        for payload in payloads:
            missing = [key for key in ("src", "trg", "ref") if key not in payload]
            if missing:
                raise ValueError(
                    f"Missing required translation fields {missing} in {payload}"
                )
            src = payload["src"]
            trg = payload["trg"]
            ref = payload["ref"]

            user_prompt = (
                f"Source: {src}\nReference: {ref}\nHypothesis: {trg}\n"
                'Return JSON with fields: score (0-100), verdict ("good"|"ok"|"bad"), '
                "explanation (short)."
            )
            response = self.query_llm(
                [
                    {
                        "role": "system",
                        "content": "You are a translation quality judge. Rate adequacy/fluency.",
                    },
                    {"role": "user", "content": user_prompt},
                ]
            )

            message = response.get("choices", [{}])[0].get("message", {})
            content = message.get("content", "").strip()

            # Extract the JSON if it's returned with triple backticks.
            if content.startswith("```"):
                lines = content.splitlines()
                content = "\n".join(
                    line for line in lines if not line.strip().startswith("```")
                )

            parsed = json.loads(content)
            score = parsed.get("score")
            if score is None:
                raise ValueError(f"Missing score in LLM judge response: {parsed}")
            if isinstance(score, str):
                try:
                    score = float(score)
                except ValueError as exc:
                    raise ValueError(
                        f"Invalid score value in LLM judge response: {parsed}"
                    ) from exc

            results.append(
                {
                    "score": score,
                    "verdict": parsed.get("verdict"),
                    "explanation": parsed.get("explanation"),
                    "model": response.get("model"),
                }
            )

        if not results:
            raise ValueError("No evaluation results were produced for LLM judge data.")

        scores = [result.get("score", 0) for result in results]
        return {
            "name": "llm-judge",
            "values": scores,
            "lowerIsBetter": False,
        }