/**
 * Local Quantized Model with Tuned Sampling (vLLM / llama-server)
 *
 * Demonstrates the sampling parameters exposed in v1.3+ — `topK`, `minP`,
 * `frequencyPenalty`, `presencePenalty`, `parallelToolCalls`, and the
 * `extraBody` escape hatch — applied to a quantized MoE model served by an
 * OpenAI-compatible local inference server.
 *
 * Why this example exists:
 *   Highly quantized MoE models (e.g. Qwen2.5-MoE @ Q4, DeepSeek-MoE @ Q4)
 *   on consumer hardware tend to fall into repetition loops or hallucinate
 *   tool-call schemas when sampling defaults are too permissive. The knobs
 *   below clamp the distribution and discourage repetition. Cloud OpenAI
 *   users do not need any of these — defaults are tuned for full-precision
 *   models. This is a local-quantized concern.
 *
 * Run:
 *   no_proxy=localhost npx tsx examples/providers/local-quantized.ts
 *
 * Prerequisites — pick one OpenAI-compatible local server:
 *   • vLLM:         `vllm serve Qwen/Qwen2.5-7B-Instruct-AWQ --port 8000`
 *   • llama-server: `llama-server -m model.gguf --port 8080`
 *   • LM Studio, Ollama, etc. — any server that accepts OpenAI chat
 *     completions on a local port.
 *   Then update LOCAL_BASE_URL and LOCAL_MODEL below.
 *
 * Provider compatibility note (per `AgentConfig` JSDoc):
 *   • topP                               — universal
 *   • topK                               — Anthropic + OpenAI-compatible local
 *   • minP                               — OpenAI-compatible local only
 *   • frequencyPenalty / presencePenalty — OpenAI track only
 *   • parallelToolCalls                  — OpenAI track only; set `false` for
 *                                          local servers that mishandle
 *                                          concurrent tool_call deltas
 *   • extraBody                          — adapter-specific escape hatch
 *
 * Cloud OpenAI rejects `top_k` and `min_p`; this example is not portable to
 * `api.openai.com`.
 */

import { OpenMultiAgent } from '../../src/index.js'
import type { AgentConfig } from '../../src/types.js'

// ---------------------------------------------------------------------------
// Configuration — adjust to your local server
// ---------------------------------------------------------------------------

const LOCAL_BASE_URL = 'http://localhost:8000/v1' // vLLM default; llama-server: 8080, LM Studio: 1234
const LOCAL_MODEL = 'Qwen/Qwen2.5-7B-Instruct-AWQ' // any model your local server has loaded

// ---------------------------------------------------------------------------
// Agent — each sampling knob is annotated with the loop / hallucination
// failure mode it counters on quantized models.
// ---------------------------------------------------------------------------

const assistant: AgentConfig = {
  name: 'assistant',
  model: LOCAL_MODEL,
  provider: 'openai',
  baseURL: LOCAL_BASE_URL,
  apiKey: 'local', // placeholder — local servers ignore this, but the OpenAI SDK requires a non-empty value

  // Standard sampling — sane defaults for instruction-tuned quantized models.
  temperature: 0.7,
  maxTokens: 1024,

  // Nucleus + top-k jointly clamp the candidate pool. On Q4 quants the raw
  // distribution often has long, noisy tails that produce off-topic tokens.
  topP: 0.95,
  topK: 40,

  // Min-p drops any token whose probability is below `minP * max_prob`. Cuts
  // the tail more aggressively than top-k alone. vLLM / llama-server expose
  // this; cloud OpenAI does not.
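  // Worked example (illustrative numbers, not measured output): if the most
  // likely next token has probability 0.60, then with minP = 0.05 every token
  // whose probability is under 0.60 * 0.05 = 0.03 is cut before sampling. The
  // threshold scales with the model's confidence, unlike a fixed top-k cutoff.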
  minP: 0.05,

  // Frequency penalty discourages repeated tokens — the most common failure
  // mode is the model getting stuck emitting the same tool_call schema or
  // sentence fragment over and over. Value range is -2..2; 0.3 is a mild
  // nudge that does not noticeably hurt fluency.
  frequencyPenalty: 0.3,

  // Presence penalty is usually 0 unless you specifically want to push the
  // model toward novel topics. Left here to make the contract explicit.
  presencePenalty: 0,

  // parallelToolCalls is deliberately unset: this example registers no tools,
  // so the flag would be inert. When you do add tools against a local server,
  // set `parallelToolCalls: false` (see the compatibility note in the header).

  // extraBody: adapter-specific escape hatch. vLLM and llama-server accept a
  // `repetition_penalty` parameter that is not in the OpenAI spec — it scales
  // the logits of recently emitted tokens. Slightly redundant with
  // frequencyPenalty, but it operates on logits instead of token counts, so
  // the two compose. Anything you put here is merged into the request body
  // and can override standard sampling fields, but it cannot override
  // transport fields (`model`, `messages`, `tools`, `stream`).
  extraBody: {
    repetition_penalty: 1.05,
  },

  systemPrompt: `You are a concise assistant. Answer in one short paragraph. Do not repeat yourself.`,

  maxTurns: 4,
}

// ---------------------------------------------------------------------------
// Run
// ---------------------------------------------------------------------------

const orchestrator = new OpenMultiAgent({
  defaultModel: LOCAL_MODEL,
  maxConcurrency: 1, // most local servers serve one request at a time
})

console.log(`Calling ${LOCAL_MODEL} at ${LOCAL_BASE_URL}`)
console.log('Sampling: topP=0.95 topK=40 minP=0.05 freqPenalty=0.3 + repetition_penalty=1.05\n')

const result = await orchestrator.runAgent(
  assistant,
  'In one paragraph, explain what min-p sampling does and when it helps.',
)

console.log('--- response ---')
console.log(result.output)
console.log()
console.log(`tokens: ${result.tokenUsage.input_tokens} in / ${result.tokenUsage.output_tokens} out`)
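
// For reference, a sketch of the request body this config should produce.
// It assumes the adapter maps the camelCase config fields to their OpenAI
// snake_case equivalents and merges `extraBody` last; apart from `top_k` and
// `min_p` (local-server extensions), these are standard OpenAI chat-completion
// fields.
//
//   {
//     "model": "Qwen/Qwen2.5-7B-Instruct-AWQ",
//     "temperature": 0.7,
//     "max_tokens": 1024,
//     "top_p": 0.95,
//     "top_k": 40,
//     "min_p": 0.05,
//     "frequency_penalty": 0.3,
//     "presence_penalty": 0,
//     "repetition_penalty": 1.05,   // merged in from extraBody
//     "messages": [ ... ]
//   }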