apiVersion: score.dev/v1b1
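# Score workload: a GPU-backed PyTorch recommendation model server with an
# Envoy sidecar fronting HTTP traffic and a sidecar A/B experiment controller.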
metadata:
  name: ml-inference-service
  annotations:
    app.kubernetes.io/component: inference
    app.kubernetes.io/part-of: ml-platform
    ml.company.io/model-name: product-recommendations
    ml.company.io/model-version: "4.2.1"
    ml.company.io/framework: pytorch
    gpu.company.io/required: "true"
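# Exposed ports: HTTP inference (fronted by the Envoy sidecar), gRPC inference,
# Prometheus metrics, and the admin/health endpoint used by the probes below.
# Example request against the predict route (hypothetical payload, shaped after
# the inputSchema further down):
#   curl -s -H 'Content-Type: application/json' http://<service>:8080/v1/predict \
#     -d '{"user_id": "u1", "product_ids": ["p1", "p2"], "context": {}}'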
service:
  ports:
    http:
      port: 8080
      targetPort: 8080
      protocol: TCP
    grpc:
      port: 8081
      targetPort: 8081
      protocol: TCP
    metrics:
      port: 9090
      targetPort: 9090
      protocol: TCP
    admin:
      port: 8082
      targetPort: 8082
      protocol: TCP
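# Three containers share the pod network namespace: the model server, an Envoy
# request router, and an A/B experiment controller.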
containers:
  model-server:
    image: myacr.azurecr.io/ml-serving/inference-server:4.2.1
    command:
      - python
      - -m
      - inference_server
    args:
      - --config=/etc/model-server/config.yaml
      - --model-path=/models/current
      - --http-port=8088  # reached via the Envoy sidecar, which binds pod port 8080
      - --grpc-port=8081
      - --metrics-port=9090
      - --admin-port=8082
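    # CUDA allocator and thread-pool tuning plus service wiring; ${resources.*}
    # placeholders are resolved by the Score implementation at deploy time.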
    variables:
      MODEL_NAME: product-recommendations
      MODEL_VERSION: "4.2.1"
      PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:512"
      CUDA_VISIBLE_DEVICES: "0"
      OMP_NUM_THREADS: "4"
      MKL_NUM_THREADS: "4"
      TOKENIZERS_PARALLELISM: "false"
      MODEL_REGISTRY_URL: ${resources.model-registry.url}
      MODEL_REGISTRY_TOKEN: ${resources.model-registry-credentials.token}
      FEATURE_STORE_HOST: ${resources.feature-store.host}
      FEATURE_STORE_PORT: ${resources.feature-store.port}
      CACHE_REDIS_HOST: ${resources.inference-cache.host}
      CACHE_REDIS_PORT: ${resources.inference-cache.port}
      CACHE_TTL_SECONDS: "300"
      MAX_BATCH_SIZE: "32"
      MAX_QUEUE_DELAY_MS: "100"
      REQUEST_TIMEOUT_MS: "5000"
      DB_CONNECTION_STRING: ${resources.prediction-log-db.connection_string}
      OTEL_EXPORTER_OTLP_ENDPOINT: ${resources.observability.otlp_endpoint}
      OTEL_SERVICE_NAME: ml-inference-service
      LOG_LEVEL: INFO
      PYTHONUNBUFFERED: "1"
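    # Rendered at deploy time; ${...} placeholders inside `content` are expanded
    # unless noExpand is set (see labels.json below).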
    files:
      /etc/model-server/config.yaml:
        content: |
          server:
            name: ml-inference-service
            version: 4.2.1
          
          model:
            name: product-recommendations
            version: 4.2.1
            type: pytorch
            inputSchema:
              type: object
              properties:
                user_id:
                  type: string
                product_ids:
                  type: array
                  items:
                    type: string
                context:
                  type: object
            outputSchema:
              type: object
              properties:
                recommendations:
                  type: array
                  items:
                    type: object
                    properties:
                      product_id:
                        type: string
                      score:
                        type: number
                confidence:
                  type: number
          
          inference:
            batchSize: 32
            maxQueueDelay: 100ms
            timeout: 5s
            warmup:
              enabled: true
              samples: 100
          
          caching:
            enabled: true
            ttl: 300s
            keyPrefix: "rec:v4:"
          
          featureStore:
            enabled: true
            features:
              - user_embeddings
              - product_embeddings
              - user_history
              - trending_products
          
          monitoring:
            metrics:
              enabled: true
              histogramBuckets:
                - 0.005
                - 0.01
                - 0.025
                - 0.05
                - 0.1
                - 0.25
                - 0.5
                - 1.0
            logging:
              predictionSampling: 0.01
              errorLogging: true
        mode: "0644"
      /etc/model-server/labels.json:
        source: ./config/labels.json
        mode: "0644"
        noExpand: true
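    # Model artifacts are mounted read-only from shared storage; the cache
    # mounts below are writable scratch space.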
    volumes:
      /models/current:
        source: ${resources.model-storage.source}
        path: product-recommendations/v4.2.1
        readOnly: true
      /models/cache:
        source: ${resources.model-cache.source}
        readOnly: false
      /data/features:
        source: ${resources.feature-cache.source}
        readOnly: false
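    # Score container resources only model cpu/memory; GPU scheduling is left
    # to the platform, signaled via the gpu.company.io/required annotation.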
    resources:
      requests:
        memory: 8Gi
        cpu: "4"
      limits:
        memory: 32Gi
        cpu: "8"
    readinessProbe:
      httpGet:
        path: /v1/health/ready
        port: 8082
        scheme: HTTP
        httpHeaders:
          - name: Accept
            value: application/json
    livenessProbe:
      httpGet:
        path: /v1/health/live
        port: 8082
        scheme: HTTP
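  # Envoy sidecar: owns pod port 8080 and proxies /v1/predict to the model
  # server on loopback port 8088; health-checked via its admin /ready endpoint.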
  request-router:
    image: envoyproxy/envoy:v1.28.0
    args:
      - --config-path=/etc/envoy/envoy.yaml
      - --log-level=info
    files:
      /etc/envoy/envoy.yaml:
        content: |
          # Admin interface; serves the /ready endpoint used by the readiness probe.
          admin:
            address:
              socket_address:
                address: 0.0.0.0
                port_value: 8001
          static_resources:
            listeners:
              - name: http_listener
                address:
                  socket_address:
                    address: 0.0.0.0
                    port_value: 8080
                filter_chains:
                  - filters:
                      - name: envoy.filters.network.http_connection_manager
                        typed_config:
                          "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
                          stat_prefix: ingress_http
                          route_config:
                            name: local_route
                            virtual_hosts:
                              - name: inference
                                domains: ["*"]
                                routes:
                                  - match:
                                      prefix: "/v1/predict"
                                    route:
                                      cluster: model_server
                                      timeout: 10s
                                    request_headers_to_add:
                                      - header:
                                          key: X-Request-Start
                                          value: "%START_TIME%"
                          http_filters:
                            - name: envoy.filters.http.router
                              typed_config:
                                "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
            clusters:
              - name: model_server
                connect_timeout: 1s
                type: STATIC
                lb_policy: ROUND_ROBIN
                load_assignment:
                  cluster_name: model_server
                  endpoints:
                    - lb_endpoints:
                        - endpoint:
                            address:
                              socket_address:
                                address: 127.0.0.1
                                port_value: 8088
        mode: "0644"
    resources:
      requests:
        memory: 64Mi
        cpu: 100m
      limits:
        memory: 256Mi
        cpu: 500m
    readinessProbe:
      httpGet:
        path: /ready
        port: 8001
        scheme: HTTP
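  # Splits traffic between experiment variants and records outcome metrics.
  # MODEL_SERVER_ENDPOINT targets the model server's admin port (8082).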
  ab-controller:
    image: myacr.azurecr.io/ml-serving/ab-controller:1.5.0
    variables:
      EXPERIMENT_CONFIG_PATH: /etc/experiments/config.yaml
      MODEL_SERVER_ENDPOINT: http://127.0.0.1:8082
      METRICS_ENDPOINT: ${resources.observability.metrics_endpoint}
      EXPERIMENT_DB_URL: ${resources.experiment-db.connection_string}
    files:
      /etc/experiments/config.yaml:
        content: |
          experiments:
            - name: recommendation-algorithm-v4
              enabled: true
              variants:
                - name: control
                  weight: 80
                  config:
                    algorithm: collaborative-filtering
                - name: treatment
                  weight: 20
                  config:
                    algorithm: transformer-based
              metrics:
                - name: click_through_rate
                  type: ratio
                - name: conversion_rate
                  type: ratio
                - name: latency_p99
                  type: percentile
                  percentile: 99
        mode: "0644"
    resources:
      requests:
        memory: 128Mi
        cpu: 100m
      limits:
        memory: 256Mi
        cpu: 250m
    readinessProbe:
      httpGet:
        path: /health
        port: 8090
        scheme: HTTP
    livenessProbe:
      exec:
        command:
          - /bin/sh
          - -c
          - test -f /tmp/healthy
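# Resource declarations below are provisioned by the Score implementation and
# surfaced to the containers via ${resources.<name>.<key>} placeholders, e.g.
# (assuming local development with score-compose):
#   score-compose init
#   score-compose generate score.yaml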
resources:
  model-registry:
    type: service
    id: ml-platform.model-registry
    metadata:
      annotations:
        ml.company.io/component: registry
    params:
      version: "2.0"
  model-registry-credentials:
    type: secret
    id: ml-platform.registry-auth
    params:
      keys:
        - token
        - refreshToken
  feature-store:
    type: service
    class: feast
    id: ml-platform.feature-store
    params:
      offlineStore: bigquery
      onlineStore: redis
  inference-cache:
    type: redis
    class: cluster
    id: ml-platform.inference-cache
    params:
      version: "7.2"
      maxMemoryPolicy: volatile-lru
      maxMemory: 4gb
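  # Time-partitioned store for the sampled prediction log (predictionSampling
  # is set to 1% in the model-server config above).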
  prediction-log-db:
    type: postgres
    class: analytics
    id: ml-platform.predictions
    metadata:
      annotations:
        analytics.company.io/retention: "90d"
    params:
      version: "15"
      extensions:
        - timescaledb
      partitioning:
        type: time
        interval: daily
  experiment-db:
    type: postgres
    class: standard
    id: ml-platform.experiments
    params:
      version: "15"
      size: small
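  # Volumes: shared read-only NFS for model artifacts, node-local SSD for the
  # model cache, and ephemeral scratch for feature data.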
  model-storage:
    type: volume
    class: nfs
    id: ml-platform.models
    metadata:
      annotations:
        storage.company.io/backup: "true"
    params:
      size: 500Gi
      accessMode: ReadOnlyMany
  model-cache:
    type: volume
    class: ssd
    params:
      size: 100Gi
      accessMode: ReadWriteOnce
  feature-cache:
    type: volume
    class: ephemeral
    params:
      size: 20Gi
  observability:
    type: service
    id: shared.observability
    params:
      traces: true
      metrics: true
      logs: true
      customMetrics:
        - prediction_latency
        - batch_size
        - cache_hit_rate