{ "widgets": [ { "type": "metric", "x": 0, "y": 1, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName, Namespace} envoy_server_live', 'Sum', 60))", "label": "Count", "id": "e2", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 60, "title": "Meshed Pods" } }, { "type": "metric", "x": 0, "y": 4, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "AVG(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_server_uptime', 'Average', 60)/3600))", "label": "hour", "id": "e1", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "setPeriodToTimeRange": false, "title": "Average Uptime" } }, { "type": "metric", "x": 12, "y": 4, "width": 6, "height": 3, "properties": { "metrics": [ [ { "expression": "AVG(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_connect_timeout', 'Average', 60)))", "id": "e2" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "start": "-PT1H", "end": "P0D", "setPeriodToTimeRange": false, "title": "Connection Timeouts" } }, { "type": "metric", "x": 0, "y": 7, "width": 18, "height": 6, "properties": { "metrics": [ [ { "expression": "SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_rx_bytes_total', 'Sum', 60)))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] InBound (RX)", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_tx_bytes_total', 'Sum', 60)))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] OutBound (TX)", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "timeSeries", 
"stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "setPeriodToTimeRange": false, "title": "Total Traffic (inbound & outbound) " } }, { "type": "metric", "x": 3, "y": 4, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_rx_bytes_total', 'Sum', 60)))/60", "id": "e1", "region": "{{YOUR_AWS_REGION}}", "label": "Bytes/sec" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 60, "setPeriodToTimeRange": false, "title": "Inbound Traffic" } }, { "type": "metric", "x": 6, "y": 4, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_tx_bytes_total', 'Sum', 60)))/60", "id": "e1", "region": "{{YOUR_AWS_REGION}}", "label": "Bytes/sec" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 60, "setPeriodToTimeRange": false, "title": "Outbound Traffic" } }, { "type": "metric", "x": 9, "y": 4, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_membership_total', 'Sum', 60))) - SUM(REMOVE_EMPTY(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_membership_healthy', 'Sum', 60)))", "id": "e1", "region": "{{YOUR_AWS_REGION}}", "label": "" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "setPeriodToTimeRange": false, "title": "Unhealthy Pods" } }, { "type": "metric", "x": 9, "y": 1, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_server_memory_heap_size', 'Sum', 60))", "id": "e2", "period": 300, "region": 
"{{YOUR_AWS_REGION}}", "label": "" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "setPeriodToTimeRange": false, "title": "Total Envoy Heap" } }, { "type": "metric", "x": 12, "y": 1, "width": 6, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_server_memory_allocated', 'Sum', 60))", "id": "e2", "period": 300, "region": "{{YOUR_AWS_REGION}}", "label": "" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "setPeriodToTimeRange": false, "title": "Total Envoy Memory Usage" } }, { "type": "metric", "x": 3, "y": 1, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_http_downstream_rq_total', 'Sum', 60))/60", "id": "e1", "label": "rps" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 300, "start": "-PT1H", "end": "P0D", "setPeriodToTimeRange": false, "title": "Requests/Second" } }, { "type": "metric", "x": 6, "y": 1, "width": 3, "height": 3, "properties": { "metrics": [ [ { "expression": "(SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx$ (envoy_http_conn_manager_prefix=\"ingress\" OR envoy_http_conn_manager_prefix=\"egress\") envoy_response_code_class=\"4\"', 'Sum', 60)) + SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx$ (envoy_http_conn_manager_prefix=\"ingress\" OR envoy_http_conn_manager_prefix=\"egress\") envoy_response_code_class=\"5\"', 'Sum', 60)))/60", "id": "e2", "period": 60, "label": "" } ] ], "view": "singleValue", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Sum", "period": 60, 
"setPeriodToTimeRange": false, "title": "Failures/Second" } }, { "type": "metric", "x": 0, "y": 13, "width": 9, "height": 6, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"ingress\" envoy_response_code_class=\"2\"', 'Sum', 60))/60", "id": "e3", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 2XX" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"ingress\" envoy_response_code_class=\"1\"', 'Sum', 60))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 1XX" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"ingress\" envoy_response_code_class=\"3\"', 'Sum', 60))/60", "id": "e4", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 3XX" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"ingress\" envoy_response_code_class=\"4\"', 'Sum', 60))/60", "id": "e5", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 4XX" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"ingress\" envoy_response_code_class=\"5\"', 'Sum', 60))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 5XX" } ] ], "view": "timeSeries", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Average", "period": 300, "title": "Ingress HTTP Requests/sec" } }, { "type": "metric", "x": 9, "y": 13, "width": 9, "height": 
6, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"egress\" envoy_response_code_class=\"2\"', 'Sum', 60))/60", "id": "e3", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 2XX", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"egress\" envoy_response_code_class=\"1\"', 'Sum', 60))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 1XX", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"egress\" envoy_response_code_class=\"3\"', 'Sum', 60))/60", "id": "e4", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 3XX", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"egress\" envoy_response_code_class=\"4\"', 'Sum', 60))/60", "id": "e5", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 4XX", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace,envoy_http_conn_manager_prefix,envoy_response_code_class} envoy_http_downstream_rq_xx envoy_http_conn_manager_prefix=\"egress\" envoy_response_code_class=\"5\"', 'Sum', 60))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] HTTP 5XX", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "timeSeries", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Average", "period": 300, "title": "Egress HTTP Requests/sec" } }, { "type": "metric", 
"x": 0, "y": 19, "width": 9, "height": 6, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_pending_failure_eject', 'Sum', 60))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] pending failure ejection" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_pending_overflow', 'Sum', 60))/60", "id": "e3", "label": "[max: ${MAX}, avg: ${AVG}] pending overflow" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_connect_timeout', 'Sum', 60))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] connect timeout" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_timeout', 'Sum', 60))/60", "id": "e4", "label": "[max: ${MAX}, avg: ${AVG}] request timeout" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_per_try_timeout', 'Sum', 60))/60", "id": "e5", "label": "[max: ${MAX}, avg: ${AVG}] per try request timeout" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_rx_reset', 'Sum', 60))/60", "id": "e6", "label": "[max: ${MAX}, avg: ${AVG}] request reset" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_cx_destroy_local_with_active_rq', 'Sum', 60))/60", "id": "e7", "label": "[max: ${MAX}, avg: ${AVG}] destroy initialized from originating service" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_http_downstream_cx_destroy_remote_active_rq', 'Sum', 60))/60", "id": "e8", "label": "[max: ${MAX}, avg: ${AVG}] destroy initialized from remote service" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} 
envoy_cluster_upstream_rq_maintenance_mode', 'Sum', 60))/60", "id": "e9", "label": "[max: ${MAX}, avg: ${AVG}] request failed maintenance mode" } ] ], "view": "timeSeries", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Average", "period": 300, "title": "Upstream Request Errors" } }, { "type": "metric", "x": 9, "y": 19, "width": 9, "height": 6, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_flow_control_paused_reading_total', 'Sum', 60))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] paused reading from destination service", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_flow_control_resumed_reading_total', 'Sum', 60))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] resumed reading from destination service", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_flow_control_backed_up_total', 'Sum', 60))/60", "id": "e3", "label": "[max: ${MAX}, avg: ${AVG}] paused reading from originating service", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_flow_control_drained_total', 'Sum', 60))/60", "id": "e4", "label": "[max: ${MAX}, avg: ${AVG}] resumed reading from originating service", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "timeSeries", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Average", "period": 300, "title": "Upstream Flow Control" } }, { "type": "metric", "x": 0, "y": 25, "width": 18, "height": 6, "properties": { "metrics": [ [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_retry', 'Sum', 60))/60", "id": "e2", "label": "[max: ${MAX}, avg: ${AVG}] request retry", 
"region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_retry_success', 'Sum', 60))/60", "id": "e1", "label": "[max: ${MAX}, avg: ${AVG}] request retry success", "region": "{{YOUR_AWS_REGION}}" } ], [ { "expression": "SUM(SEARCH('{ContainerInsights/Prometheus,ClusterName,Namespace} envoy_cluster_upstream_rq_retry_overflow', 'Sum', 60))/60", "id": "e3", "label": "[max: ${MAX}, avg: ${AVG}] request retry overflow", "region": "{{YOUR_AWS_REGION}}" } ] ], "view": "timeSeries", "stacked": false, "region": "{{YOUR_AWS_REGION}}", "stat": "Average", "period": 300, "title": "Upstream Request Retry" } }, { "type": "text", "x": 0, "y": 0, "width": 18, "height": 1, "properties": { "markdown": "\n# K8S AWS App Mesh\n" } }, { "type": "log", "x": 0, "y": 31, "width": 18, "height": 6, "properties": { "query": "SOURCE '/aws/containerinsights/{{YOUR_CLUSTER_NAME}}/prometheus' | fields Namespace, app, pod_name, @message\n| limit 50\n| filter CloudWatchMetrics.0.Namespace=\"ContainerInsights/Prometheus\"\n| filter CloudWatchMetrics.0.Metrics.0.Name like /envoy_/\n| stats sum(envoy_cluster_upstream_cx_rx_bytes_buffered) as bytes_rx_buffered, sum(envoy_cluster_upstream_cx_tx_bytes_buffered) as bytes_tx_buffered, sum(envoy_http_downstream_rq_xx) as http_downstream_rq_xx by app, pod_name\n| sort bytes_rx_buffered desc \n", "region": "{{YOUR_AWS_REGION}}", "stacked": false, "title": "Summary by App", "view": "table" } } ] }