# Derived from ./manifests
---
# Namespace that every other object in this manifest is created in.
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
---
# Grants the "prometheus" ClusterRole to the prometheus-k8s ServiceAccount in
# the monitoring namespace.
# rbac.authorization.k8s.io/v1beta1 is no longer served as of Kubernetes 1.22;
# the stable v1 group version is used instead (no field changes are needed for
# this object, and the Deployments in this file already rely on apps/v1).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
---
# Read-only access: get/list/watch on nodes, nodes/proxy, services, endpoints
# and pods; get on configmaps; get on the /metrics non-resource URL.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources:
      - configmaps
    verbs: ["get"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Identity referenced by the ClusterRoleBinding above.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-k8s
  namespace: monitoring
# NOTE(review): the ConfigMap that starts below continues past this line in the whitespace-collapsed source; it is kept verbatim.
--- apiVersion: v1 data: default.tmpl: | {{ define "__alertmanager" }}AlertManager{{ end }} {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} {{ define "__description" }}{{ end }} {{ define "__text_alert_list" }}{{ range . }}Labels: {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} {{ end }}Annotations: {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} {{ end }}Source: {{ .GeneratorURL }} {{ end }}{{ end }} {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} {{ define "slack.default.pretext" }}{{ end }} {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . 
}}{{ end }} {{ define "slack.default.iconemoji" }}{{ end }} {{ define "slack.default.iconurl" }}{{ end }} {{ define "slack.default.text" }}{{ end }} {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }} {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} {{ if gt (len .Alerts.Firing) 0 -}} Alerts Firing: {{ template "__text_alert_list" .Alerts.Firing }} {{- end }} {{ if gt (len .Alerts.Resolved) 0 -}} Alerts Resolved: {{ template "__text_alert_list" .Alerts.Resolved }} {{- end }} {{- end }} {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} {{ define "email.default.html" }} {{ template "__subject" . }}
{{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} {{ .Name }}={{ .Value }} {{ end }}
{{ if gt (len .Alerts.Firing) 0 }} {{ end }} {{ range .Alerts.Firing }} {{ end }} {{ if gt (len .Alerts.Resolved) 0 }} {{ if gt (len .Alerts.Firing) 0 }} {{ end }} {{ end }} {{ range .Alerts.Resolved }} {{ end }}
View in {{ template "__alertmanager" . }}
[{{ .Alerts.Firing | len }}] Firing
Labels
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} Source



[{{ .Alerts.Resolved | len }}] Resolved
Labels
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} Source
{{ end }} {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} {{ if gt (len .Alerts.Firing) 0 }} Alerts Firing: {{ template "__text_alert_list" .Alerts.Firing }} {{ end }} {{ if gt (len .Alerts.Resolved) 0 }} Alerts Resolved: {{ template "__text_alert_list" .Alerts.Resolved }} {{ end }} {{ end }} {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} slack.tmpl: | {{ define "slack.devops.text" }} {{range .Alerts}}{{.Annotations.DESCRIPTION}} {{end}} {{ end }} kind: ConfigMap metadata: creationTimestamp: null name: alertmanager-templates namespace: monitoring --- kind: ConfigMap apiVersion: v1 metadata: name: alertmanager namespace: monitoring data: config.yml: |- global: # ResolveTimeout is the time after which an alert is declared resolved # if it has not been updated. resolve_timeout: 5m # The smarthost and SMTP sender used for mail notifications. smtp_smarthost: 'smtp.gmail.com:587' smtp_from: 'foo@bar.com' smtp_auth_username: 'foo@bar.com' smtp_auth_password: 'barfoo' # The API URL to use for Slack notifications. slack_api_url: 'https://hooks.slack.com/services/some/api/token' # # The directory from which notification templates are read. templates: - '/etc/alertmanager-templates/*.tmpl' # The root route on which each incoming alert enters. route: # The labels by which incoming alerts are grouped together. For example, # multiple alerts coming in for cluster=A and alertname=LatencyHigh would # be batched into a single group. group_by: ['alertname', 'cluster', 'service'] # When a new group of alerts is created by an incoming alert, wait at # least 'group_wait' to send the initial notification. # This way ensures that you get multiple alerts for the same group that start # firing shortly after another are batched together on the first # notification. 
group_wait: 30s # When the first notification was sent, wait 'group_interval' to send a batch # of new alerts that started firing for that group. group_interval: 5m # If an alert has successfully been sent, wait 'repeat_interval' to # resend them. #repeat_interval: 1m repeat_interval: 15m # A default receiver # If an alert isn't caught by a route, send it to default. receiver: default # All the above attributes are inherited by all child routes and can # overwritten on each. # The child route trees. routes: # Send severity=slack alerts to slack. - match: severity: slack receiver: slack_alert # - match: # severity: email # receiver: email_alert receivers: - name: 'default' slack_configs: - channel: '#alertmanager-test' text: '{{ template "slack.devops.text" . }}' send_resolved: true - name: 'slack_alert' slack_configs: - channel: '#alertmanager-test' send_resolved: true
---
# Alertmanager, single replica. Configuration and notification templates are
# mounted from the "alertmanager" and "alertmanager-templates" ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      name: alertmanager
      labels:
        app: alertmanager
    spec:
      volumes:
        # Provides config.yml under /etc/alertmanager.
        - name: config-volume
          configMap:
            name: alertmanager
        # Provides the *.tmpl files under /etc/alertmanager-templates.
        - name: templates-volume
          configMap:
            name: alertmanager-templates
        # Scratch space passed to the storage path flag.
        - name: alertmanager
          emptyDir: {}
      containers:
        - name: alertmanager
          image: quay.io/prometheus/alertmanager:v0.7.1
          args:
            - '-config.file=/etc/alertmanager/config.yml'
            - '-storage.path=/alertmanager'
          ports:
            - name: alertmanager
              containerPort: 9093
          volumeMounts:
            - name: config-volume
              mountPath: /etc/alertmanager
            - name: templates-volume
              mountPath: /etc/alertmanager-templates
            - name: alertmanager
              mountPath: /alertmanager
---
# Exposes the Alertmanager pods on port 9093 through a NodePort; the
# prometheus.io/* annotations mark the service for scraping at /metrics.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: monitoring
  labels:
    name: alertmanager
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/path: '/metrics'
spec:
  type: NodePort
  selector:
    app: alertmanager
  ports:
    - name: alertmanager
      protocol: TCP
      port: 9093
      targetPort: 9093
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana-core
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
        component: core
    spec:
      volumes:
        # Backed by an emptyDir even though the name says "persistent".
        - name: grafana-persistent-storage
          emptyDir: {}
      containers:
        - name: grafana-core
          image: grafana/grafana:4.2.0
          imagePullPolicy: IfNotPresent
          resources:
            # Requests equal limits so the container stays in the guaranteed class.
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 100m
              memory: 100Mi
          env:
            # Enable basic auth; the admin user name and password are read
            # from the "grafana" Secret.
            - name: GF_AUTH_BASIC_ENABLED
              value: "true"
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-username
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-password
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "false"
            # - name: GF_AUTH_ANONYMOUS_ORG_ROLE
            #   value: Admin
            # Does not really work, because of template variables in exported dashboards:
            # - name: GF_DASHBOARDS_JSON_ENABLED
            #   value: "true"
          readinessProbe:
            httpGet:
              path: /login
              port: 3000
            # initialDelaySeconds: 30
            # timeoutSeconds: 1
          volumeMounts:
            - name: grafana-persistent-storage
              mountPath: /var/lib/grafana
--- apiVersion: v1 data: grafana-net-2-dashboard.json: | { "__inputs": [{ "name": "DS_PROMETHEUS", "label": "Prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" }], "__requires": [{ "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "text", "name": "Text", "version": "" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "3.1.0" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }], "id": null, "title": "Prometheus Stats", "tags": [], "style": "dark", "timezone": "browser", "editable": true, "hideControls": true, 
"sharedCrosshair": false, "rows": [{ "collapse": false, "editable": true, "height": 178, "panels": [{ "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], "datasource": "${DS_PROMETHEUS}", "decimals": 1, "editable": true, "error": false, "format": "s", "id": 5, "interval": null, "links": [], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "(time() - container_start_time_seconds{container_name=\"kube-apiserver\"})", "intervalFactor": 2, "refId": "A", "step": 4 }], "thresholds": "", "title": "Uptime", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current", "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "rangeMaps": [{ "from": "null", "to": "null", "text": "N/A" }], "mappingType": 1, "gauge": { "show": false, "minValue": 0, "maxValue": 100, "thresholdMarkers": true, "thresholdLabels": false } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "id": 6, "interval": null, "links": [], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [{ "expr": "prometheus_local_storage_memory_series", "intervalFactor": 2, "refId": "A", "step": 4 }], "thresholds": 
"1,5", "title": "Local Storage Memory Series", "type": "singlestat", "valueFontSize": "70%", "valueMaps": [], "valueName": "current", "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "rangeMaps": [{ "from": "null", "to": "null", "text": "N/A" }], "mappingType": 1, "gauge": { "show": false, "minValue": 0, "maxValue": 100, "thresholdMarkers": true, "thresholdLabels": false } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "none", "id": 7, "interval": null, "links": [], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": true }, "targets": [{ "expr": "prometheus_local_storage_indexing_queue_length", "intervalFactor": 2, "refId": "A", "step": 4 }], "thresholds": "500,4000", "title": "Internal Storage Queue Length", "type": "singlestat", "valueFontSize": "70%", "valueMaps": [{ "op": "=", "text": "Empty", "value": "0" }], "valueName": "current", "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "rangeMaps": [{ "from": "null", "to": "null", "text": "N/A" }], "mappingType": 1, "gauge": { "show": false, "minValue": 0, "maxValue": 100, "thresholdMarkers": true, "thresholdLabels": false } }, { "content": "\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

", "editable": true, "error": false, "id": 9, "links": [], "mode": "html", "span": 3, "style": {}, "title": "", "transparent": true, "type": "text" }], "title": "New row" }, { "collapse": false, "editable": true, "height": 227, "panels": [{ "aliasColors": { "prometheus": "#C15C17", "{instance=\"localhost:9090\",job=\"prometheus\"}": "#C15C17" }, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 9, "stack": false, "steppedLine": false, "targets": [{ "expr": "rate(prometheus_local_storage_ingested_samples_total[5m])", "interval": "", "intervalFactor": 2, "legendFormat": "{{job}}", "metric": "", "refId": "A", "step": 2 }], "timeFrom": null, "timeShift": null, "title": "Samples ingested (rate-5m)", "tooltip": { "shared": true, "value_type": "cumulative", "ordering": "alphabetical", "msResolution": false }, "type": "graph", "yaxes": [{ "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }, { "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }], "xaxis": { "show": true } }, { "content": "#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. 
", "editable": true, "error": false, "id": 8, "links": [], "mode": "markdown", "span": 2.995914043583536, "style": {}, "title": "", "transparent": true, "type": "text" }], "title": "New row" }, { "collapse": false, "editable": true, "height": "250px", "panels": [{ "aliasColors": { "prometheus": "#F9BA8F", "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" }, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 5, "stack": false, "steppedLine": false, "targets": [{ "expr": "rate(prometheus_target_interval_length_seconds_count[5m])", "intervalFactor": 2, "legendFormat": "{{job}}", "refId": "A", "step": 2 }], "timeFrom": null, "timeShift": null, "title": "Target Scrapes (last 5m)", "tooltip": { "shared": true, "value_type": "cumulative", "ordering": "alphabetical", "msResolution": false }, "type": "graph", "yaxes": [{ "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }, { "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }], "xaxis": { "show": true } }, { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 14, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": 
"connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 4, "stack": false, "steppedLine": false, "targets": [{ "expr": "prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", "interval": "", "intervalFactor": 2, "legendFormat": "{{quantile}} ({{interval}})", "metric": "", "refId": "A", "step": 2 }], "timeFrom": null, "timeShift": null, "title": "Scrape Duration", "tooltip": { "shared": true, "value_type": "cumulative", "ordering": "alphabetical", "msResolution": false }, "type": "graph", "yaxes": [{ "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }, { "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }], "xaxis": { "show": true } }, { "content": "#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. 
", "editable": true, "error": false, "id": 11, "links": [], "mode": "markdown", "span": 3, "style": {}, "title": "", "transparent": true, "type": "text" }], "title": "New row" }, { "collapse": false, "editable": true, "height": "250px", "panels": [{ "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": null, "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 12, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 9, "stack": false, "steppedLine": false, "targets": [{ "expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", "interval": "", "intervalFactor": 2, "legendFormat": "{{quantile}}", "refId": "A", "step": 2 }], "timeFrom": null, "timeShift": null, "title": "Rule Eval Duration", "tooltip": { "shared": true, "value_type": "cumulative", "ordering": "alphabetical", "msResolution": false }, "type": "graph", "yaxes": [{ "show": true, "min": null, "max": null, "logBase": 1, "format": "percentunit", "label": "" }, { "show": true, "min": null, "max": null, "logBase": 1, "format": "short" }], "xaxis": { "show": true } }, { "content": "#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. 
The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.", "editable": true, "error": false, "id": 15, "links": [], "mode": "markdown", "span": 3, "style": {}, "title": "", "transparent": true, "type": "text" }], "title": "New row" }], "time": { "from": "now-5m", "to": "now" }, "timepicker": { "now": true, "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] }, "templating": { "list": [] }, "annotations": { "list": [] }, "refresh": false, "schemaVersion": 12, "version": 0, "links": [{ "icon": "info", "tags": [], "targetBlank": true, "title": "Grafana Docs", "tooltip": "", "type": "link", "url": "http://www.grafana.org/docs" }, { "icon": "info", "tags": [], "targetBlank": true, "title": "Prometheus Docs", "type": "link", "url": "http://prometheus.io/docs/introduction/overview/" }], "gnetId": 2, "description": "The official, pre-built Prometheus Stats Dashboard." 
} grafana-net-737-dashboard.json: | { "__inputs": [{ "name": "DS_PROMETHEUS", "label": "prometheus", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" }], "__requires": [{ "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "3.1.0" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }], "id": null, "title": "Kubernetes Pod Resources", "description": "Shows resource usage of Kubernetes pods.", "tags": [ "kubernetes" ], "style": "dark", "timezone": "browser", "editable": true, "hideControls": false, "sharedCrosshair": false, "rows": [{ "collapse": false, "editable": true, "height": "250px", "panels": [{ "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "height": "180px", "id": 4, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum (machine_memory_bytes{instance=~\"^$instance$\"}) * 100", "interval": "", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 2 }], 
"thresholds": "65, 90", "timeFrom": "1m", "timeShift": null, "title": "Memory Working Set", "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "height": "180px", "id": 6, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$instance$\"}) * 100", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "65, 90", "timeFrom": "1m", "timeShift": null, "title": "Cpu Usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true 
}, "height": "180px", "id": 7, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"}) * 100", "interval": "10s", "intervalFactor": 1, "legendFormat": "", "metric": "", "refId": "A", "step": 10 }], "thresholds": "65, 90", "timeFrom": "1m", "timeShift": null, "title": "Filesystem Usage", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "hideTimeOverride": true, "id": 9, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "20%", "prefix": "", "prefixFontSize": "20%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": 
"sum(container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "hideTimeOverride": true, "id": 10, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum (machine_memory_bytes{instance=~\"^$instance$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, 
"height": "1px", "hideTimeOverride": true, "id": 11, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " cores", "postfixFontSize": "30%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m]))", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "timeShift": null, "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "hideTimeOverride": true, "id": 12, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": " cores", "postfixFontSize": "30%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum (machine_cpu_cores{instance=~\"^$instance$\"})", "interval": 
"10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "hideTimeOverride": true, "id": 13, "interval": null, "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "title": "Used", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "height": "1px", "hideTimeOverride": true, "id": 14, "interval": null, "isNew": 
true, "links": [], "mappingType": 1, "mappingTypes": [{ "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 }], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [{ "from": "null", "text": "N/A", "to": "null" }], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "targets": [{ "expr": "sum (container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", "step": 10 }], "thresholds": "", "timeFrom": "1m", "title": "Total", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [{ "op": "=", "text": "N/A", "value": "null" }], "valueName": "current" }, { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)", "thresholdLine": false }, "height": "200px", "id": 32, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [{ "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", "interval": "", "intervalFactor": 2, "legendFormat": "receive", "metric": "network", "refId": "A", "step": 240 }, { "expr": "- sum(rate(container_network_transmit_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", "interval": 
"", "intervalFactor": 2, "legendFormat": "transmit", "metric": "network", "refId": "B", "step": 240 }], "timeFrom": null, "timeShift": null, "title": "Network", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "show": true }, "yaxes": [{ "format": "Bps", "label": "transmit / receive", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": false }] }], "showTitle": true, "title": "all pods" }, { "collapse": false, "editable": true, "height": "250px", "panels": [{ "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": 3, "editable": true, "error": false, "fill": 0, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "height": "", "id": 17, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [{ "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", "metric": "container_cpu", "refId": "A", "step": 240 }], "timeFrom": null, "timeShift": null, "title": "Cpu Usage", "tooltip": { "msResolution": true, "shared": false, "sort": 2, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "show": true }, "yaxes": [{ "format": "none", "label": "cores", 
"logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false }] }, { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "fill": 0, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 33, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [{ "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", "metric": "", "refId": "A", "step": 240 }], "timeFrom": null, "timeShift": null, "title": "Memory Working Set", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [{ "format": "bytes", "label": "used", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false }] }, { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 16, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, 
"hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [{ "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ pod_name }} < in", "metric": "network", "refId": "A", "step": 240 }, { "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ pod_name }} > out", "metric": "network", "refId": "B", "step": 240 }], "timeFrom": null, "timeShift": null, "title": "Network", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [{ "format": "Bps", "label": "transmit / receive", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false }] }, { "aliasColors": {}, "bars": false, "datasource": "${DS_PROMETHEUS}", "decimals": 2, "editable": true, "error": false, "fill": 1, "grid": { "threshold1": null, "threshold1Color": "rgba(216, 200, 27, 0.27)", "threshold2": null, "threshold2Color": "rgba(234, 112, 112, 0.22)" }, "id": 34, "isNew": true, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 12, "stack": false, "steppedLine": false, "targets": [{ "expr": "sum(container_fs_usage_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", "metric": "network", "refId": "A", "step": 240 }], "timeFrom": null, "timeShift": null, "title": "Filesystem", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "show": true }, "yaxes": [{ "format": "bytes", "label": "used", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false }] }], "showTitle": true, "title": "each pod" }], "time": { "from": "now-3d", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "templating": { "list": [{ "allValue": ".*", "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": true, "label": "Instance", "multi": false, "name": "instance", "options": [], "query": "label_values(instance)", "refresh": 1, "regex": "", "type": "query" }, { "current": {}, "datasource": "${DS_PROMETHEUS}", "hide": 0, "includeAll": true, "label": "Namespace", "multi": true, "name": "namespace", "options": [], "query": "label_values(namespace)", "refresh": 1, "regex": "", "type": "query" }] }, "annotations": { "list": [] }, "refresh": false, "schemaVersion": 12, "version": 8, "links": [], "gnetId": 737 } prometheus-datasource.json: | { "name": "prometheus", "type": "prometheus", "url": "http://prometheus:9090", "access": "proxy", "basicAuth": false } kind: ConfigMap metadata: creationTimestamp: null name: grafana-import-dashboards namespace: 
monitoring
---
# One-shot Job: POSTs every *-datasource.json and *-dashboard.json file mounted
# from the grafana-import-dashboards ConfigMap to the Grafana HTTP API.
apiVersion: batch/v1
kind: Job
metadata:
  name: grafana-import-dashboards
  namespace: monitoring
  labels:
    app: grafana
    component: import-dashboards
spec:
  template:
    metadata:
      name: grafana-import-dashboards
      labels:
        app: grafana
        component: import-dashboards
    spec:
      serviceAccountName: prometheus-k8s
      initContainers:
        # Polls http://grafana:3000 every 15s and only exits once it answers 200,
        # so the import container never runs against an unready Grafana.
        # NOTE(review): image has no tag, so it resolves to the default tag.
        - name: wait-for-grafana
          image: giantswarm/tiny-tools
          args:
            - /bin/sh
            - -c
            - >
              set -x;
              while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do
                echo '.'
                sleep 15;
              done
      containers:
        - name: grafana-import-dashboards
          image: giantswarm/tiny-tools
          command: ["/bin/sh", "-c"]
          workingDir: /opt/grafana-import-dashboards
          # First loop creates datasources; second loop wraps each dashboard in
          # an import envelope that binds the DS_PROMETHEUS input to the
          # "prometheus" datasource, then POSTs it to /api/dashboards/import.
          args:
            - >
              for file in *-datasource.json ; do
                if [ -e "$file" ] ; then
                  echo "importing $file" &&
                  curl --silent --fail --show-error \
                    --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \
                    --header "Content-Type: application/json" \
                    --data-binary "@$file" ;
                  echo "" ;
                fi
              done ;
              for file in *-dashboard.json ; do
                if [ -e "$file" ] ; then
                  echo "importing $file" &&
                  ( echo '{"dashboard":'; \
                    cat "$file"; \
                    echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \
                  | jq -c '.' \
                  | curl --silent --fail --show-error \
                    --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \
                    --header "Content-Type: application/json" \
                    --data-binary "@-" ;
                  echo "" ;
                fi
              done
          env:
            # Credentials come from the "grafana" Secret defined below.
            - name: GF_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-username
            - name: GF_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana
                  key: admin-password
          volumeMounts:
            - name: config-volume
              mountPath: /opt/grafana-import-dashboards
      restartPolicy: Never
      volumes:
        - name: config-volume
          configMap:
            name: grafana-import-dashboards
---
# Optional Ingress for Grafana, disabled. Fill in the host before enabling.
# apiVersion: extensions/v1beta1
# kind: Ingress
# metadata:
#   name: grafana
#   namespace: monitoring
# spec:
#   rules:
#     - host: ..k8s.gigantic.io
#       http:
#         paths:
#           - path: /
#             backend:
#               serviceName: grafana
#               servicePort: 3000
---
# Grafana admin credentials consumed by the import Job above.
# NOTE(review): both values are base64 for "admin"; replace them before
# exposing Grafana outside a test cluster.
apiVersion: v1
kind: Secret
data:
  admin-password: YWRtaW4=
  admin-username: YWRtaW4=
metadata:
  name: grafana
  namespace: monitoring
type: Opaque
---
# Exposes the Grafana core pods on port 3000 via a NodePort.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  labels:
    app: grafana
    component: core
spec:
  type: NodePort
  ports:
    - port: 3000
  selector:
    app: grafana
    component: core
---
# Prometheus server configuration, mounted at /etc/prometheus by the
# prometheus-core Deployment.
apiVersion: v1
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
      - "/etc/prometheus-rules/*.rules"
    scrape_configs:

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
      - job_name: 'kubernetes-nodes'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:10255'
            target_label: __address__

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
      - job_name: 'kubernetes-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: (.+)(?::\d+);(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name

      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: (.+):(?:\d+);(\d+)
            replacement: ${1}:${2}
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: 9\d{3}

      - job_name: 'kubernetes-cadvisor'
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-core
  namespace: monitoring
---
# Prometheus server (single replica). Config and alert rules are mounted from
# the prometheus-core and prometheus-rules ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-core
  namespace: monitoring
  labels:
    app: prometheus
    component: core
spec:
  replicas: 1
  selector:
    # "component" keeps this selector from also matching the
    # prometheus-node-exporter pods, which share the app: prometheus label.
    # spec.selector is immutable in apps/v1: recreate an existing Deployment
    # to pick this change up.
    matchLabels:
      app: prometheus
      component: core
  template:
    metadata:
      name: prometheus-main
      labels:
        app: prometheus
        component: core
    spec:
      serviceAccountName: prometheus-k8s
      containers:
        - name: prometheus
          image: prom/prometheus:v1.7.0
          args:
            - '-storage.local.retention=12h'
            - '-storage.local.memory-chunks=500000'
            - '-config.file=/etc/prometheus/prometheus.yaml'
            - '-alertmanager.url=http://alertmanager:9093/'
          ports:
            - name: webui
              containerPort: 9090
          resources:
            requests:
              cpu: 500m
              memory: 500M
            limits:
              cpu: 500m
              memory: 500M
          volumeMounts:
            - name: config-volume
              mountPath: /etc/prometheus
            - name: rules-volume
              mountPath: /etc/prometheus-rules
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-core
        - name: rules-volume
          configMap:
            name: prometheus-rules
---
# kube-state-metrics (single replica), serving on container port 8080.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
        - name: kube-state-metrics
          image: gcr.io/google_containers/kube-state-metrics:v0.5.0
          ports:
            - containerPort: 8080
---
# NOTE(review): the ClusterRole/ClusterRoleBinding for the kube-state-metrics
# ServiceAccount are commented out here; confirm the permissions are granted
# elsewhere if the cluster enforces RBAC.
# ---
# apiVersion: rbac.authorization.k8s.io/v1beta1
# kind: ClusterRoleBinding
# metadata:
#   name: kube-state-metrics
# roleRef:
#   apiGroup: rbac.authorization.k8s.io
#   kind: ClusterRole
#   name: kube-state-metrics
# subjects:
#   - kind: ServiceAccount
#     name: kube-state-metrics
#     namespace: monitoring
# ---
# apiVersion: rbac.authorization.k8s.io/v1beta1
# kind: ClusterRole
# metadata:
#   name: kube-state-metrics
# rules:
#   - apiGroups: [""]
#     resources:
#       - nodes
#       - pods
#       - services
#       - resourcequotas
#       - replicationcontrollers
#       - limitranges
#     verbs: ["list", "watch"]
#   - apiGroups: ["apps"]
#     resources:
#       - daemonsets
#       - deployments
#       - replicasets
#     verbs: ["list", "watch"]
# ---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: monitoring
---
# Service in front of kube-state-metrics; the prometheus.io/scrape annotation
# is what the 'kubernetes-endpoints' scrape job keys on.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
spec:
  ports:
    - name: kube-state-metrics
      port: 8080
      protocol: TCP
  selector:
    app: kube-state-metrics
---
# Per-node directory size exporter: `read-du` writes a metrics file into a
# shared in-memory volume and `caddy` serves that volume on port 9102.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-directory-size-metrics
  namespace: monitoring
  labels:
    app: node-directory-size-metrics
  annotations:
    description: |
      This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes.
      The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger than `100M` for now.
      The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus.
      These are scheduled on every node in the Kubernetes cluster.
      To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`.
spec:
  selector:
    matchLabels:
      app: node-directory-size-metrics
  template:
    metadata:
      labels:
        app: node-directory-size-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '9102'
        description: |
          This `Pod` provides metrics in Prometheus format about disk usage on the node.
          The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger than `100M` for now.
          The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus.
          This `Pod` is scheduled on every node in the Kubernetes cluster.
          To choose directories from the node to check just mount them on `read-du` below `/mnt`.
    spec:
      containers:
        - name: read-du
          image: giantswarm/tiny-tools
          imagePullPolicy: Always
          # FIXME threshold via env var
          # Every 300s: collect `du` output for /mnt into a temp file, then
          # `mv` it over /tmp/metrics so caddy never serves a half-written file.
          command:
            - fish
            - --command
            - |
              touch /tmp/metrics-temp
              while true
                for directory in (du --bytes --separate-dirs --threshold=100M /mnt)
                  echo $directory | read size path
                  echo "node_directory_size_bytes{path=\"$path\"} $size" \
                    >> /tmp/metrics-temp
                end
                mv /tmp/metrics-temp /tmp/metrics
                sleep 300
              end
          volumeMounts:
            - name: host-fs-var
              mountPath: /mnt/var
              readOnly: true
            - name: metrics
              mountPath: /tmp
        - name: caddy
          image: dockermuenster/caddy:0.9.3
          command:
            - "caddy"
            - "-port=9102"
            - "-root=/var/www"
          ports:
            - containerPort: 9102
          volumeMounts:
            - name: metrics
              mountPath: /var/www
      volumes:
        - name: host-fs-var
          hostPath:
            path: /var
        - name: metrics
          emptyDir:
            medium: Memory
---
# node-exporter on every node, using the host network and PID namespace and
# bound to host port 9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  selector:
    # "component" keeps this selector from also matching the prometheus-core
    # pods, which share the app: prometheus label. spec.selector is immutable
    # in apps/v1: recreate an existing DaemonSet to pick this change up.
    matchLabels:
      app: prometheus
      component: node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      containers:
        - image: prom/node-exporter:v0.14.0
          name: prometheus-node-exporter
          ports:
            - name: prom-node-exp
              #^ must be an IANA_SVC_NAME (at most 15 characters, ..)
              containerPort: 9100
              hostPort: 9100
      hostNetwork: true
      hostPID: true
---
# Headless Service (clusterIP: None) over the node-exporter pods, annotated
# for the 'kubernetes-endpoints' scrape job.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  clusterIP: None
  ports:
    - name: prometheus-node-exporter
      port: 9100
      protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
---
# Alerting rules (Prometheus 1.x rule syntax), mounted at /etc/prometheus-rules
# and loaded through the rule_files glob in prometheus.yaml.
apiVersion: v1
data:
  cpu-usage.rules: |
    ALERT NodeCPUUsage
      IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High CPU usage detected",
        DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
      }
  instance-availability.rules: |
    ALERT InstanceDown
      IF up == 0
      FOR 1m
      LABELS { severity = "page" }
      ANNOTATIONS {
        summary = "Instance {{ $labels.instance }} down",
        description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
      }
  low-disk-space.rules: |
    ALERT NodeLowRootDisk
      IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low root disk space",
        DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeLowDataDisk
      IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low data disk space",
        DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
      }
  mem-usage.rules: |
    ALERT NodeSwapUsage
      IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Swap usage detected",
        DESCRIPTION = "{{$labels.instance}}: Swap usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeMemoryUsage
      IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High memory usage detected",
        DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
      }
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-rules
  namespace: monitoring
---
# NodePort Service for the Prometheus web UI (port 9090) on the core pods.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
  labels:
    app: prometheus
    component: core
  annotations:
    prometheus.io/scrape: 'true'
spec:
  type: NodePort
  ports:
    - port: 9090
      protocol: TCP
      name: webui
  selector:
    app: prometheus
    component: core