# Monitors Text Generation Inference Servers (TGIS).
# Depending on your configuration, you might have to modify labels if you have customized
# them from the defaults.
apiVersion: metrics.turbonomic.io/v1alpha1
kind: PrometheusQueryMapping
metadata:
  name: text-generation-inference
  labels:
    mapping: text-generation-inference
spec:
  entities:
    - type: application
      attributes:
        - label: container
          name: container
        - isIdentifier: true
          label: instance
          matches: \d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})??
          name: ip
        - label: namespace
          name: namespace
        - label: pod
          name: pod
        # If the "service" label is not available, use the commented-out attribute below instead.
        # It tries to guess the service by looking at the pod name. If your pods do not follow
        # the standard Deployment/ReplicaSet naming convention, this method will not work.
        # The regex uses 2 sub-patterns to capture the service name:
        # 1. (.*)-[a-f0-9]{10}-[a-z0-9]{5}$
        #    This matches when the deployment/service name is < 47 chars, in which case Kubernetes
        #    does not do any name shortening before naming pods.
        #    [a-f0-9]{10} checks for the replicaset hash, which is 10 chars when there is no shortening.
        #    [a-z0-9]{5}$ checks for the pod hash, which is always present as the last 5 chars.
        # 2. (.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        #    If the deployment/service name is >= 47 and <= 57 chars, Kubernetes truncates the hyphens,
        #    the replicaset hash and then the deployment name, so as to adhere to the 63-character
        #    limit for pod names.
        #    [a-f0-9]{0,10}[a-z0-9]{5}$ checks for the merged replicaset hash and pod hash when there
        #    is no hyphen between the two hashes.
        # Neither 1 nor 2 works if the deployment name is > 57 chars.
        #- label: pod
        #  as: service
        #  name: service
        #  matches: (.*)-[a-f0-9]{10}-[a-z0-9]{5}$|(.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
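        # Illustrative example of the commented-out regex above, using hypothetical pod names
        # (not taken from any real workload):
        #   "router-7f9c4d8b6a-x2k9z"           -> sub-pattern 1 matches and captures "router".
        #   "a-long-truncated-name-86bcdfx2k9z" -> sub-pattern 2 matches on the merged hashes and
        #                                          captures "a-long-truncated-name".
        #   "db-0" (e.g. a StatefulSet pod)     -> neither sub-pattern matches; no service is derived.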
        - label: service
          name: service
        # If the "service_name" label is not available, use the commented-out attribute below instead.
        # It tries to guess the service by looking at the pod name. If your pods do not follow
        # the standard Deployment/ReplicaSet naming convention, this method will not work.
        # The regex uses the same 2 sub-patterns described above:
        # 1. (.*)-[a-f0-9]{10}-[a-z0-9]{5}$
        #    This matches when the deployment/service name is < 47 chars, in which case Kubernetes
        #    does not do any name shortening before naming pods.
        #    [a-f0-9]{10} checks for the replicaset hash, which is 10 chars when there is no shortening.
        #    [a-z0-9]{5}$ checks for the pod hash, which is always present as the last 5 chars.
        # 2. (.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        #    If the deployment/service name is >= 47 and <= 57 chars, Kubernetes truncates the hyphens,
        #    the replicaset hash and then the deployment name, so as to adhere to the 63-character
        #    limit for pod names.
        #    [a-f0-9]{0,10}[a-z0-9]{5}$ checks for the merged replicaset hash and pod hash when there
        #    is no hyphen between the two hashes.
        # Neither 1 nor 2 works if the deployment name is > 57 chars.
        #- label: pod
        #  as: service_name
        #  name: service_name
        #  matches: (.*)-[a-f0-9]{10}-[a-z0-9]{5}$|(.*)-[a-f0-9]{0,10}[a-z0-9]{5}$
        - label: service
          name: service_name
        - label: namespace
          name: service_ns
      metrics:
        - type: transaction
          queries:
            - type: used
              # This query exhibits the following pattern: "A OR B + C OR D == 0".
              # 1) A (tgi_request_total_tokens_sum) is only available in the IBM variant of the TGI
              #    implementation, not in the original HuggingFace one, in which only
              #    B (tgi_request_input_length_sum) and C (tgi_request_generated_tokens_sum) exist.
              #    Their sum is A.
              # 2) If neither A nor B + C is available, we take D if D is 0.
              # More details:
              # - "A" and "B + C" are both 99th-percentile measurements of the total number of tokens.
              #   In LLM serving, this is generally regarded as a better measurement of throughput than
              #   the total request count, because the token count per request varies a lot: imagine a
              #   chat response that could be thousands of tokens vs. a classification result that could
              #   be just a few tokens.
              # - "D" is the number of requests. The "D" portion of the expression ensures proper
              #   handling of the scenario where there are zero observations, in which case we'd want
              #   this query to return 0. However, the "token count" metric will not be 0 in that case;
              #   instead, it will be unavailable, as there are no requests whose tokens could be
              #   counted. To address this, we append the "OR D == 0" portion, which returns 0 because
              #   the request count is 0.
              # Note: it is theoretically impossible for the request count to be non-zero while the
              # token count is unavailable.
              promql: histogram_quantile(0.99, rate(tgi_request_total_tokens_bucket{}[30m])) OR histogram_quantile(0.99, rate(tgi_request_input_length_bucket{}[30m])) + histogram_quantile(0.99, rate(tgi_request_generated_tokens_bucket{}[30m])) OR rate(tgi_request_count{}[30m]) == 0
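              # A worked example with hypothetical values (not measured anywhere): if the IBM metric
              # tgi_request_total_tokens_bucket is absent but the HuggingFace histograms yield a p99
              # input length of 150 and a p99 generated length of 350 over the window, the expression
              # resolves to B + C = 150 + 350 = 500. If no requests were observed at all, A and B + C
              # are both unavailable, and the trailing "OR rate(tgi_request_count{}[30m]) == 0" clause
              # makes the query return 0 instead of no data.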
        - type: queuingTime
          queries:
            - type: used
              # The 99th-percentile of queuing time in milliseconds over the past 30 minutes.
              # If there are zero observations over the 30 minutes, then the value 0 will be returned.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_request_queue_duration_bucket{}[30m])) OR rate(tgi_request_count{}[30m]) == 0)
        - type: responseTime
          queries:
            - type: used
              # The 99th-percentile of response time in milliseconds over the past 30 minutes.
              # This query will return nothing (missing data) if there are zero observations, which we
              # think is the correct behavior: without any requests recorded, we can't really measure
              # the response time, which is certainly not zero.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_request_duration_bucket{}[30m])))
        - type: serviceTime
          queries:
            - type: used
              # The 99th-percentile of "service" time in milliseconds over the past 30 minutes.
              # "serviceTime" measures the TPOT (time-per-output-token).
              # This query will return nothing (missing data) when there are zero observations during
              # the period, which we think is the correct behavior: without any requests recorded, we
              # can't really measure the service time, which is certainly not zero.
              # A note on the TGI "method" filter below. There are two method types when it comes to
              # measuring inference duration:
              # - One is "prefill", which measures the time to first (output) token (TTFT). This value
              #   varies with the input token length.
              # - The other is called "decode" or "next_token", which measures the TPOT; this is
              #   relatively stable per model, so we can set a meaningful SLO for this metric per model
              #   and use it to drive scaling.
              # We use a negative filter below, {method != "prefill"}, instead of {method = "next_token"},
              # to make the query work for both variants of TGI: the HuggingFace variant uses the term
              # "decode", while the IBM variant uses "next_token". We could also use the regex pattern
              # {method =~ "next_token|decode"}.
              promql: 1000 * (histogram_quantile(0.99, rate(tgi_batch_inference_duration_bucket{method != "prefill"}[30m])))
        - type: concurrentQueries
          queries:
            - type: used
              # A similar "A > B OR B OR C == 0" pattern to those above, used to retrieve the current
              # batch size, with a 0 value under the zero-transaction scenario.
              promql: avg_over_time(tgi_batch_current_size{}[10m]) > avg_over_time(tgi_batch_current_size{}[1h]) OR avg_over_time(tgi_batch_current_size{}[1h]) OR rate(tgi_request_input_count{}[1h]) == 0
              # If preferred, replace the above with the following to simply retrieve a single
              # 10-minute moving average.
              # promql: avg_over_time(tgi_batch_current_size{}[10m]) OR rate(tgi_request_input_count{}[1h]) == 0
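              # A worked example with hypothetical values (not measured anywhere): with a 10-minute
              # average batch size of 8 and a 1-hour average of 5, the first clause (8 > 5) returns 8;
              # if the 10-minute average later drops to 3, that clause returns nothing and the 1-hour
              # average of 5 is reported instead; if the batch-size series is absent and no requests
              # were observed over the hour, the trailing "== 0" clause returns 0.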