---
# SemanticRouter example with complexity-aware routing
# This example demonstrates few-shot complexity classification for intelligent routing
# Route simple queries to fast models and complex queries to powerful models
apiVersion: vllm.ai/v1alpha1
kind: SemanticRouter
metadata:
  name: semantic-router-complexity
  namespace: default
spec:
  # Basic deployment settings
  replicas: 1

  # Configure vLLM backend endpoints with different reasoning families
  vllmEndpoints:
    # Fast model for simple queries
    - name: llama-8b-fast
      model: llama3-8b
      reasoningFamily: qwen3
      backend:
        type: kserve
        inferenceServiceName: llama-3-8b
      weight: 2  # Prefer this for simple queries

    # Powerful model for complex queries
    - name: llama-70b-reasoning
      model: llama3-70b
      reasoningFamily: deepseek
      backend:
        type: kserve
        inferenceServiceName: llama-3-70b
      weight: 1

  # Persistence for model cache
  persistence:
    enabled: true
    size: 10Gi

  # Resource allocation
  resources:
    requests:
      cpu: 2000m
      memory: 4Gi
    limits:
      cpu: 4000m
      memory: 8Gi

  # Semantic router configuration
  config:
    # Embedding models for classification
    embedding_models:
      qwen3_model_path: "models/mom-embedding-pro"
      use_cpu: true

    hnsw_config:
      model_type: "qwen3"
      preload_embeddings: true
      enable_soft_matching: true
      min_score_threshold: "0.5"

    # Complexity rules for intelligent routing
    complexity_rules:
      # Rule 1: Code complexity classification
      - name: "code-complexity"
        description: "Classify coding tasks by complexity"
        threshold: "0.3"  # Lower threshold works better for embedding-based similarity
        # Hard examples: complex coding tasks
        hard:
          candidates:
            - "Implement a distributed lock manager with leader election"
            - "Design a database migration system with rollback support"
            - "Create a compiler optimization pass for loop unrolling"
            - "Build a graph algorithm for strongly connected components"
            - "Develop a custom memory allocator with fragmentation handling"
        # Easy examples: simple coding tasks
        easy:
          candidates:
            - "Write a function to reverse a string"
            - "Create a class to represent a rectangle"
            - "Implement a simple counter with increment/decrement"
            - "Write a function to check if a number is even"
            - "Create a basic TODO list with add/remove operations"

      # Rule 2: Reasoning complexity classification
      - name: "reasoning-complexity"
        description: "Classify reasoning and problem-solving tasks"
        threshold: "0.3"  # Lower threshold works better for embedding-based similarity
        # Hard examples: complex reasoning
        hard:
          candidates:
            - "Analyze the geopolitical implications of renewable energy adoption"
            - "Evaluate the ethical considerations of AI in healthcare decision-making"
            - "Design a multi-stage marketing strategy for a new product launch"
            - "Explain quantum entanglement and its applications in cryptography"
            - "Propose solutions for traffic congestion in urban environments"
        # Easy examples: simple reasoning
        easy:
          candidates:
            - "What is the capital of France?"
            - "How many days are in a week?"
            - "What color is the sky on a clear day?"
            - "Name three common pets"
            - "What is 5 + 3?"

      # Rule 3: Domain-specific complexity (with composer for conditional application)
      - name: "medical-complexity"
        description: "Classify medical queries by complexity (only for medical domain)"
        threshold: "0.3"  # Lower threshold works better for embedding-based similarity
        # Hard examples: complex medical scenarios
        hard:
          candidates:
            - "Differential diagnosis for chest pain with dyspnea and elevated troponin"
            - "Treatment protocol for multi-drug resistant tuberculosis"
            - "Surgical approach for complex congenital heart defect repair"
            - "Management of acute liver failure with hepatic encephalopathy"
            - "Immunotherapy selection for metastatic melanoma"
        # Easy examples: simple medical queries
        easy:
          candidates:
            - "What is the normal body temperature?"
            - "How often should I brush my teeth?"
            - "What are common symptoms of a cold?"
            - "How much water should I drink daily?"
            - "What is a fever?"
        # Composer: only apply this rule if domain is medical
        composer:
          operator: "AND"
          conditions:
            - type: "domain"
              name: "medical"

    # Semantic cache configuration
    semantic_cache:
      enabled: true
      backend_type: "memory"
      embedding_model: "qwen3"
      similarity_threshold: "0.85"
      max_entries: 5000
      ttl_seconds: 7200

    # Prompt guard
    prompt_guard:
      enabled: true
      threshold: "0.7"

    # Tools configuration
    tools:
      enabled: true
      top_k: 3

    # Default reasoning effort for queries without complexity classification
    default_reasoning_effort: "medium"