# Monitors Text Generation Inference Servers (TGIS).
# Depending on your configuration, you might have to modify labels if you have customized
# them from the defaults.
apiVersion: metrics.turbonomic.io/v1alpha1
kind: PrometheusQueryMapping
metadata:
  name: text-generation-inference
  labels:
    mapping: text-generation-inference
spec:
  entities:
    - type: application
      attributes:
        - label: container
          name: container
        - isIdentifier: true
          label: instance
          matches: \d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})??
          name: ip
        - label: namespace
          name: namespace
        - label: pod
          name: pod
        # If the "service" label is not available, use the commented-out attribute below instead.
        # It tries to guess the service by looking at the pod name. If your pods do not follow
        # the standard Deployment/ReplicaSet naming convention, this method will not work.
        # The regex uses 2 sub-patterns to capture the service name:
        # 1. (.*)-[a-f0-9]{10}-[a-z0-9]{5}$
        #    This matches when the deployment/service name is < 47 chars, in which case Kubernetes
        #    does not do any name shortening before naming pods.
        #    [a-f0-9]{10} checks for the replicaset hash, which is 10 chars when there is no shortening.
        #    [a-z0-9]{5}$ checks for the pod hash, which is always present as the last 5 chars.
        # 2. (.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        #    If the deployment/service name is >= 47 and <= 57 chars, Kubernetes truncates the hyphens,
        #    the replicaset hash and then the deployment name, so as to adhere to the 63-character
        #    limit for pod names.
        #    [a-f0-9]{0,10}[a-z0-9]{5}$ checks for the merged replicaset hash and pod hash when there
        #    is no hyphen between the two hashes.
        # Neither 1 nor 2 works if the deployment name is > 57 chars.
        #- label: pod
        #  as: service
        #  name: service
        #  matches: (.*)-[a-f0-9]{10}-[a-z0-9]{5}$|(.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
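        # Illustrative example of the commented-out regex above, using hypothetical pod names
        # (not taken from any real workload):
        #   "router-7f9c4d8b6a-x2k9z"           -> sub-pattern 1 matches and captures "router".
        #   "a-long-truncated-name-86bcdfx2k9z" -> sub-pattern 2 matches on the merged hashes and
        #                                          captures "a-long-truncated-name".
        #   "db-0" (e.g. a StatefulSet pod)     -> neither sub-pattern matches; no service is derived.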
        - label: service
          name: service
        # If the "service_name" label is not available, use the commented-out attribute below instead.
        # It tries to guess the service by looking at the pod name. If your pods do not follow
        # the standard Deployment/ReplicaSet naming convention, this method will not work.
        # The regex uses the same 2 sub-patterns described above:
        # 1. (.*)-[a-f0-9]{10}-[a-z0-9]{5}$
        #    This matches when the deployment/service name is < 47 chars, in which case Kubernetes
        #    does not do any name shortening before naming pods.
        #    [a-f0-9]{10} checks for the replicaset hash, which is 10 chars when there is no shortening.
        #    [a-z0-9]{5}$ checks for the pod hash, which is always present as the last 5 chars.
        # 2. (.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        #    If the deployment/service name is >= 47 and <= 57 chars, Kubernetes truncates the hyphens,
        #    the replicaset hash and then the deployment name, so as to adhere to the 63-character
        #    limit for pod names.
        #    [a-f0-9]{0,10}[a-z0-9]{5}$ checks for the merged replicaset hash and pod hash when there
        #    is no hyphen between the two hashes.
        # Neither 1 nor 2 works if the deployment name is > 57 chars.
        #- label: pod
        #  as: service_name
        #  name: service_name
        #  matches: (.*)-[a-f0-9]{10}-[a-z0-9]{5}$|(.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        - label: service
          name: service_name
        - label: namespace
          name: service_ns
      metrics:
        - type: transaction
          queries:
            - type: used
              # This query exhibits the following pattern: "A OR B + C OR D == 0".
              # 1) A (tgi_request_total_tokens_sum) is only available in the IBM variant of the TGI
              #    implementation, not in the original HuggingFace one, in which only
              #    B (tgi_request_input_length_sum) and C (tgi_request_generated_tokens_sum) exist.
              #    Their sum is A.
              # 2) If neither A nor B + C is available, we take D if D is 0.
              # More details:
              # - "A" and "B + C" are both 99th-percentile measurements of the total number of tokens.
              #   In LLM serving, this is generally regarded as a better measurement of throughput than
              #   the total request count, because the token count per request varies a lot: imagine a
              #   chat response that could be thousands of tokens vs. a classification result that could
              #   be just a few tokens.
              # - "D" is the number of requests. The "D" portion of the expression ensures proper
              #   handling of the scenario where there are zero observations, in which case we'd want
              #   this query to return 0. However, the "token count" metric will not be 0 in that case;
              #   instead, it will be unavailable, as there are no requests whose tokens could be
              #   counted. To address this, we append the "OR D == 0" portion, which returns 0 because
              #   the request count is 0.
              # Note: it is theoretically impossible for the request count to be non-zero while the
              # token count is unavailable.
              promql: histogram_quantile(0.99, rate(tgi_request_total_tokens_bucket{}[30m])) OR histogram_quantile(0.99, rate(tgi_request_input_length_bucket{}[30m])) + histogram_quantile(0.99, rate(tgi_request_generated_tokens_bucket{}[30m])) OR rate(tgi_request_count{}[30m]) == 0
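              # A worked example with hypothetical values (not measured anywhere): if the IBM metric
              # tgi_request_total_tokens_bucket is absent but the HuggingFace histograms yield a p99
              # input length of 150 and a p99 generated length of 350 over the window, the expression
              # resolves to B + C = 150 + 350 = 500. If no requests were observed at all, A and B + C
              # are both unavailable, and the trailing "OR rate(tgi_request_count{}[30m]) == 0" clause
              # makes the query return 0 instead of no data.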
        - type: queuingTime
          queries:
            - type: used
              # The 99th-percentile of queuing time in milliseconds over the past 30 minutes.
              # If there are zero observations over the 30 minutes, then the value 0 will be returned.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_request_queue_duration_bucket{}[30m])) OR rate(tgi_request_count{}[30m]) == 0)
        - type: responseTime
          queries:
            - type: used
              # The 99th-percentile of response time in milliseconds over the past 30 minutes.
              # This query will return nothing (missing data) if there are zero observations, which we
              # think is the correct behavior: without any requests recorded, we can't really measure
              # the response time, which is certainly not zero.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_request_duration_bucket{}[30m])))
        - type: serviceTime
          queries:
            - type: used
              # The 99th-percentile of "service" time in milliseconds over the past 30 minutes.
              # "serviceTime" measures the TPOT (time-per-output-token).
              # This query will return nothing (missing data) when there are zero observations during
              # the period, which we think is the correct behavior: without any requests recorded, we
              # can't really measure the service time, which is certainly not zero.
              # A note on the TGI "method" filter below. There are two method types when it comes to
              # measuring inference duration:
              # - One is "prefill", which measures the time to first (output) token (TTFT). This value
              #   varies with the input token length.
              # - The other is called "decode" or "next_token", which measures the TPOT; this is
              #   relatively stable per model, so we can set a meaningful SLO for this metric per model
              #   and use it to drive scaling.
              # We use a negative filter below, {method != "prefill"}, instead of {method = "next_token"},
              # to make the query work for both variants of TGI: the HuggingFace variant uses the term
              # "decode", while the IBM variant uses "next_token". We could also use the regex pattern
              # {method =~ "next_token|decode"}.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_batch_inference_duration_bucket{method != "prefill"}[30m])))
        - type: concurrentQueries
          queries:
            - type: used
              # A similar "A > B OR B OR C == 0" pattern to those above, used to retrieve the current
              # batch size, with a 0 value under the zero-transaction scenario.
              promql: avg_over_time(tgi_batch_current_size{}[10m]) > avg_over_time(tgi_batch_current_size{}[1h]) OR avg_over_time(tgi_batch_current_size{}[1h]) OR rate(tgi_request_input_count{}[1h]) == 0
              # If preferred, replace the above with the following to simply retrieve a single
              # 10-minute moving average.
              # promql: avg_over_time(tgi_batch_current_size{}[10m]) OR rate(tgi_request_input_count{}[1h]) == 0
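              # A worked example with hypothetical values (not measured anywhere): with a 10-minute
              # average batch size of 8 and a 1-hour average of 5, the first clause (8 > 5) returns 8;
              # if the 10-minute average later drops to 3, that clause returns nothing and the 1-hour
              # average of 5 is reported instead; if the batch-size series is absent and no requests
              # were observed over the hour, the trailing "== 0" clause returns 0.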