{ "__inputs": [ { "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "limit": 100, "name": "Annotations & Alerts", "showIn": 0, "type": "dashboard" } ] }, "description": "KUDO Kafka Cluster Dashboard", "editable": true, "gnetId": 9018, "graphTooltip": 0, "id": null, "iteration": 1574944426067, "links": [], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 139, "panels": [], "title": "Overview", "type": "row" }, { "cacheTimeout": null, "colorBackground": false, "colorPostfix": true, "colorValue": true, "colors": [ "#962d82", "#ba43a9", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, "id": 147, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " Brokers", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": 
"count(kafka_server_KafkaServer_BrokerState{namespace=\"$namespace\",service=\"$service\"})", "format": "time_series", "instant": true, "interval": "", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Broker Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, "id": 144, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " msg", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#614d93", "full": false, "lineColor": "#962d82", "show": true }, "tableColumn": "", "targets": [ { "expr": "sum(kafka_server_BrokerTopicMetrics_MessagesIn_total{namespace=\"$namespace\",service=\"$service\", topic=\"\"})", "format": "time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Total Messages In", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, "id": 145, "interval": null, "links": 
[], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#614d93", "full": false, "lineColor": "#962d82", "show": true }, "tableColumn": "", "targets": [ { "expr": "sum(kafka_server_BrokerTopicMetrics_BytesIn_total{namespace=\"$namespace\",service=\"$service\", topic=\"\"})", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Total Messages In Size", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": true, "colorPostfix": true, "colorValue": false, "colors": [ "#bf1b00", "#bf1b00", "#629e51" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, "id": 165, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "max(kafka_server_SessionExpireListener_ZooKeeperSyncConnects_total{service=\"$service\"})", "format": "time_series", "instant": true, "interval": "", "intervalFactor": 1, "refId": "A" } ], "thresholds": "0,1", "title": "Zookeeper Sync Connects / sec", "type": 
"singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "#299c46", "#962d82", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, "id": 151, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#705da0", "full": false, "lineColor": "#962d82", "show": true }, "tableColumn": "", "targets": [ { "expr": "max(kafka_server_ReplicaManager_LeaderCount{namespace=\"$namespace\",service=\"$service\"})", "format": "time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Total Leader Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "#299c46", "#962d82", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, "id": 167, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": 
"null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#705da0", "full": false, "lineColor": "#962d82", "show": true }, "tableColumn": "", "targets": [ { "expr": "count(count by (topic) (kafka_server_BrokerTopicMetrics_MessagesIn_total{namespace=\"$namespace\",service=\"$service\"}))", "format": "time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Total Topic Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 5 }, "id": 161, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#705da0", "full": false, "lineColor": "#962d82", "show": false }, "tableColumn": "", "targets": [ { "expr": "avg(kafka_network_SocketServer_NetworkProcessorAvgIdlePercent{namespace=\"$namespace\",service=\"$service\"})", "format": "time_series", "instant": true, "intervalFactor": 1, "refId": "A" } ], "thresholds": "50,80", "title": "Request Handler Avg Load %", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": true, "colorValue": false, "colors": [ "#629e51", "#629e51", "#bf1b00" ], "datasource": "${DS_PROMETHEUS}", "description": "In a healthy 
cluster, the number of in-sync replicas (ISRs) should be exactly equal to the total number of replicas. If partition replicas fall too far behind their leaders, the follower partition is removed from the ISR pool, and you should see a corresponding increase in IsrShrinksPerSec. Since Kafka’s high-availability guarantees cannot be met without replication, investigation is certainly warranted should this metric value exceed zero for extended time periods.\n\nIf this metric has a value greater than 0, it means that data is not being replicated to enough brokers, thereby increasing the probability of data loss.", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 4, "y": 5 }, "id": 142, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "#890f02", "show": true }, "tableColumn": "", "targets": [ { "expr": "sum(kafka_server_ReplicaManager_UnderReplicatedPartitions{service=\"$service\", namespace=\"$namespace\"})", "format": "time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "0,1", "title": "Under Replicated Partitions for $service", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": true, "colorPostfix": true, "colorValue": false, "colors": [ "#bf1b00", "#bf1b00", "#629e51" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, 
"thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 5 }, "id": 164, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(kafka_controller_KafkaController_ActiveControllerCount{namespace=\"$namespace\",service=\"$service\"})", "format": "time_series", "instant": true, "interval": "", "intervalFactor": 1, "refId": "A" } ], "thresholds": "0,1", "title": "Active Controller Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "#299c46", "#962d82", "#d44a3a" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 5 }, "id": 149, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "#705da0", "full": false, "lineColor": "#962d82", "show": true }, "tableColumn": "", "targets": [ { "expr": "max(kafka_server_ReplicaManager_PartitionCount{namespace=\"$namespace\",service=\"$service\"})", "format": 
"time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "title": "Total Partition Count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "0", "value": "null" } ], "valueName": "current" }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" }, { "evaluator": { "params": [ 1 ], "type": "lt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "ActiveControllerCount alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "The first node to boot in a Kafka cluster automatically becomes the controller, and there can be only one. The controller in a Kafka cluster is responsible for maintaining the list of partition leaders, and coordinating leadership transitions (in the event a partition leader becomes unavailable). If it becomes necessary to replace the controller, a new controller is randomly chosen by ZooKeeper from the pool of brokers. 
In general, it is not possible for this value to be greater than one, but you should definitely alert on a value of zero that lasts for more than a short period (< 1s) of time.", "fill": 10, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 5 }, "id": 114, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum(kafka_controller_KafkaController_ActiveControllerCount{namespace=\"$namespace\",service=\"$service\"})by(pod, service)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1, "yaxis": "left" } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "ActiveController for ($service)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Active", "logBase": 1, "max": "1", "min": "0", "show": false }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorPostfix": true, "colorValue": true, "colors": [ "#37872D", "#E0B400", "#C4162A" ], "datasource": "${DS_PROMETHEUS}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 5 }, "id": 181, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": 
"value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "pluginVersion": "6.1.6", "postfix": "%", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "100 * (node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"} - node_filesystem_avail_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"}) / node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"}", "format": "time_series", "instant": true, "intervalFactor": 1, "legendFormat": "USED: {{$broker}}", "refId": "B" } ], "thresholds": "60,80,80", "title": "Disk Usage: $broker", "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 171, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "jvm_memory_bytes_used{namespace=\"$namespace\",service=\"$service\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}} {{area}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory Used / Broker", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "columns": [], "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 182, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Disk Used", "colorMode": "row", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Value", "thresholds": [ "60", "80", "80" ], "type": "number", "unit": "percent" }, { "alias": "Broker: Mount Path", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Metric", "thresholds": [], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "100 * (node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\"} - node_filesystem_avail_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\"}) / node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\"}", "format": "time_series", "instant": true, "intervalFactor": 1, "legendFormat": "{{pod}}: {{mountpoint}}", "refId": "A" } ], "timeFrom": 
null, "timeShift": null, "title": "Disk Used / Broker", "transform": "timeseries_to_rows", "type": "table" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 17 }, "id": 174, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(kafka_server_BrokerTopicMetrics_MessagesIn_total{service=\"$service\", namespace=\"$namespace\", topic=\"\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "messages/second", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "messages in / sec", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 17 }, "id": 179, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum by(topic) (kafka_server_BrokerTopicMetrics_MessagesIn_total{namespace=\"$namespace\",service=\"$service\", topic=~\".+\"})", "format": "time_series", "interval": "15s", "intervalFactor": 1, "legendFormat": "{{topic}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Messages / Topic", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 172, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(kafka_server_BrokerTopicMetrics_BytesIn_total{namespace=\"$namespace\",service=\"$service\", topic=\"\"})", "format": "time_series", "interval": "15s", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "All Bytes In", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 173, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_server_BrokerTopicMetrics_BytesOut_total{namespace=\"$namespace\",service=\"$service\", topic=\"\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "All Bytes Out", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 31 }, "id": 177, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": 
[ { "expr": "sum(rate(kafka_server_BrokerTopicMetrics_BytesIn_total{service=\"$service\", namespace=\"$namespace\", topic=\"\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "bytes in /second", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "bytes in / sec", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 38 }, "id": 40, "panels": [], "title": "Broker Info", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "The first node to boot in a Kafka cluster automatically becomes the controller, and there can be only one. The controller in a Kafka cluster is responsible for maintaining the list of partition leaders, and coordinating leadership transitions (in the event a partition leader becomes unavailable). If it becomes necessary to replace the controller, a new controller is randomly chosen by ZooKeeper from the pool of brokers. 
In general, it is not possible for this value to be greater than one, but you should definitely alert on a value of zero that lasts for more than a short period (< 1s) of time.", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 39 }, "id": 117, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(kafka_controller_KafkaController_GlobalPartitionCount{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Partition Count (Broker) for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, the only exceptions are when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. 
A replica could be removed from the ISR pool for a couple of reasons: it is too far behind the leader’s offset (user-configurable by setting the replica.lag.max.messages configuration parameter), or it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). No matter the reason, an increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter is cause for concern and requires user intervention. The Kafka documentation provides a wealth of information on the user-configurable parameters for brokers.", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 39 }, "id": 119, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(kafka_server_ReplicaManager_IsrShrinks_total{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "ISR Shrink Rate (Broker) for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "This value should be 
0", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 39 }, "id": 116, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_controller_KafkaController_OfflinePartitionsCount{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})by(pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Offline Partitions for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "Number of requests waiting in the producer purgatory. 
This should be non-zero when acks=all is used on the producer.", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 46 }, "id": 120, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_server_DelayedOperationPurgatory_NumDelayedOperations{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})by(pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Producer Purgatory Requests Waiting for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, the only exceptions are when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. 
A replica could be removed from the ISR pool for a couple of reasons: it is too far behind the leader’s offset (user-configurable by setting the replica.lag.max.messages configuration parameter), or it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). No matter the reason, an increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter is cause for concern and requires user intervention. The Kafka documentation provides a wealth of information on the user-configurable parameters for brokers.", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 46 }, "id": 118, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "kafka_server_ReplicaManager_IsrExpands_total{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "ISR Expansion Rate (Broker) for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "decimals": 0, "description": "", "fill": 1, 
"gridPos": { "h": 7, "w": 8, "x": 16, "y": 46 }, "id": 121, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_server_DelayedOperationPurgatory_PurgatorySize{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})by(pod)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Delayed Fetch Operation Purgatory Size for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Number of partitions on this broker. 
This should be mostly even across all brokers.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 53 }, "id": 60, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "(kafka_server_ReplicaManager_PartitionCount{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Partition Count Across Brokers for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 53 }, "id": 135, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "rate(kafka_server_BrokerTopicMetrics_MessagesIn_total{namespace=\"$namespace\", service=\"$service\", 
pod=~\"$broker\", topic=~\".+\"}[2m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{topic}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Derivative messages/sec for ($broker) / topic", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 60 }, "id": 20, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "(kafka_server_BrokerTopicMetrics_BytesIn_total{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\", topic=~\".+\"})", "format": "time_series", "interval": "15s", "intervalFactor": 1, "legendFormat": "{{topic}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "All Bytes In per (Broker) for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 60 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(kafka_server_BrokerTopicMetrics_BytesOut_total{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\", topic=~\".+\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{topic}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "All Bytes Out per (Broker) for ($service) broker ($broker)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 67 }, "id": 178, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(kafka_server_BrokerTopicMetrics_BytesIn_total{namespace=\"$namespace\",service=\"$service\",pod=~\"$broker\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ $broker }} - bytes in /second ", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "bytes in / sec", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "columns": [], "datasource": "${DS_PROMETHEUS}", "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 67 }, "id": 153, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 1, "desc": false }, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/^(__name__|container_id|dcos_cluster_id|dcos_cluster_name|namespace|fault_domain_region|fault_domain_zone|host|instance|job|metric_type|service|Time)$/", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "Count", "colorMode": "cell", "colors": [ "#7eb26d", "#7eb26d", "#bf1b00" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Value", "thresholds": [ "0", "1" ], "type": "number", "unit": "short" } ], "targets": [ { "expr": "sum by(topic) (kafka_cluster_Partition_UnderReplicated{namespace=\"$namespace\",namespace=\"$namespace\",service=\"$service\"})", "format": "table", "instant": true, 
"intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "title": "Under Replicated Partitions", "transform": "table", "type": "table" }, { "aliasColors": { "TOTAL: kafka-kafka-0": "dark-green", "TOTAL: kafka-kafka-1": "dark-green", "TOTAL: kafka-kafka-2": "dark-green", "TOTAL: kafka-kafka-3": "dark-green", "TOTAL: kafka-kafka-4": "dark-green", "TOTAL: kafka-kafka-5": "dark-green", "USED: kafka-kafka-0": "dark-red", "USED: kafka-kafka-1": "dark-red", "USED: kafka-kafka-2": "dark-red", "USED: kafka-kafka-3": "dark-red", "USED: kafka-kafka-4": "dark-red", "USED: kafka-kafka-5": "dark-red" }, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_PROMETHEUS}", "description": "Generally, disk throughput tends to be the main bottleneck in Kafka performance. However, that’s not to say that the network is never a bottleneck. Depending on your use case, hardware, and configuration, the network can quickly become the slowest segment of a message’s trip, especially if you are sending messages across data centers. 
Tracking network throughput on your brokers gives you more information as to where potential bottlenecks may lie, and can inform decisions like whether or not you should enable end-to-end compression of your messages.", "fill": 8, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 75 }, "id": 180, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": true, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"}", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "TOTAL: {{pod}}", "refId": "B" }, { "expr": "node_filesystem_size_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"} - node_filesystem_avail_bytes{mountpoint=\"/var/lib/kafka\",namespace=\"$namespace\",service=\"$service\",pod=\"$broker\"}", "format": "time_series", "instant": false, "intervalFactor": 1, "legendFormat": "USED: {{pod}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 18, "style": "dark", "tags": [ "kafka", "data-services" ], "templating": { "list": [ { "allValue": null, "current": {}, 
"datasource": "${DS_PROMETHEUS}", "definition": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "namespace", "options": [], "query": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "refresh": 1, "regex": "/.*namespace=\\\"([^\\\"]*).*/", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "service", "options": [], "query": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "refresh": 1, "regex": "/.*service=\\\"([^\\\"]*).*/", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", "definition": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "broker", "options": [], "query": "kafka_controller_ControllerStats_LeaderElectionRateAndTimeMs", "refresh": 1, "regex": "/.*pod=\\\"([^\\\"]*).*/", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "KUDO Kafka: Cluster Summary", "uid": "otWaoQkZj", "version": 1 }