---
# SemanticRouter example using mmBERT 2D Matryoshka embeddings
# This example demonstrates layer early exit and dimension reduction for optimal performance
# mmBERT provides multilingual support with adaptive quality/speed trade-offs
apiVersion: vllm.ai/v1alpha1
kind: SemanticRouter
metadata:
  name: semantic-router-mmbert
  namespace: default
spec:
  # Basic deployment settings
  replicas: 1

  # Configure vLLM backend endpoints
  vllmEndpoints:
    - name: my-llama-model
      model: llama3-8b
      reasoningFamily: qwen3
      backend:
        type: kserve
        inferenceServiceName: llama-3-8b

  # Persistence for model cache
  persistence:
    enabled: true
    size: 10Gi

  # Basic resource limits
  resources:
    requests:
      cpu: 2000m
      memory: 4Gi
    limits:
      cpu: 4000m
      memory: 8Gi

  # Semantic router configuration
  config:
    # Embedding models configuration with mmBERT 2D Matryoshka
    embedding_models:
      # Path to mmBERT model (downloads automatically on startup, or run: make download-models)
      mmbert_model_path: "models/mmbert-embedding"
      use_cpu: true

    # HNSW configuration for embedding-based classification
    hnsw_config:
      # Use mmBERT model
      model_type: "mmbert"
      # Layer early exit: balance between speed and accuracy
      # Layer 3: ~7x speedup (good for high-volume, simple queries)
      # Layer 6: ~3.6x speedup (balanced - recommended for most use cases)
      # Layer 11: ~2x speedup (higher accuracy)
      # Layer 22: full model (maximum accuracy)
      target_layer: 6
      # Dimension reduction: lower dimensions = faster similarity search
      # Supported: 64, 128, 256, 512, 768
      # 256 is recommended for balanced performance/quality
      target_dimension: 256
      preload_embeddings: true
      enable_soft_matching: true
      # Threshold kept as a quoted string — the consuming schema evidently
      # expects string-typed decimals (matches similarity_threshold below)
      min_score_threshold: "0.5"

    # Semantic cache using mmBERT embeddings
    semantic_cache:
      enabled: true
      backend_type: "memory"
      # Use mmBERT for semantic similarity in cache
      embedding_model: "mmbert"
      similarity_threshold: "0.85"
      max_entries: 5000
      ttl_seconds: 7200
      eviction_policy: "lru"
      # HNSW indexing for fast similarity search
      hnsw:
        use_hnsw: true
        hnsw_m: 16
        hnsw_ef_construction: 200
        # NOTE(review): placed under hnsw based on original token order;
        # confirm against the SemanticRouter CRD schema
        max_memory_entries: 5000

    # Prompt guard (jailbreak detection)
    prompt_guard:
      enabled: true
      threshold: "0.7"

    # Tools auto-selection
    tools:
      enabled: true
      top_k: 3