---
# create amazon-cloudwatch namespace
apiVersion: v1
kind: Namespace
metadata:
  name: amazon-cloudwatch
  labels:
    name: amazon-cloudwatch
---
# Service account used by the Fluent Bit DaemonSets and the CloudWatch agent
# custom resources defined further down in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cloudwatch-agent
  namespace: amazon-cloudwatch
---
# Service account used by the operator (controller-manager) Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-controller-manager
  namespace: amazon-cloudwatch
---
# Secret declared without data; its name is mounted by the Linux CloudWatch
# agent (volume "agenttls") below.
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: "amazon-cloudwatch-observability-agent-cert"
  namespace: amazon-cloudwatch
---
# Fluent Bit configuration for Linux nodes. Each key becomes a file under
# /fluent-bit/etc/ in the fluent-bit DaemonSet. The ${...} variables are
# supplied through that DaemonSet's container environment.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: amazon-cloudwatch
  labels:
    k8s-app: fluent-bit
data:
  fluent-bit.conf: |
    [SERVICE]
        Flush                     5
        Grace                     30
        Log_Level                 error
        Daemon                    off
        Parsers_File              parsers.conf
        storage.path              /var/fluent-bit/state/flb-storage/
        storage.sync              normal
        storage.checksum          off
        storage.backlog.mem_limit 5M

    @INCLUDE application-log.conf
    @INCLUDE dataplane-log.conf
    @INCLUDE host-log.conf

  application-log.conf: |
    [INPUT]
        Name                tail
        Tag                 application.*
        Exclude_Path        /var/log/containers/cloudwatch-agent*, /var/log/containers/fluent-bit*, /var/log/containers/aws-node*, /var/log/containers/kube-proxy*, /var/log/containers/fluentd*
        Path                /var/log/containers/*.log
        multiline.parser    docker, cri
        DB                  /var/fluent-bit/state/flb_container.db
        Mem_Buf_Limit       50MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Rotate_Wait         30
        storage.type        filesystem
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 application.*
        Path                /var/log/containers/fluent-bit*
        multiline.parser    docker, cri
        DB                  /var/fluent-bit/state/flb_log.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 application.*
        Path                /var/log/containers/cloudwatch-agent*
        multiline.parser    docker, cri
        DB                  /var/fluent-bit/state/flb_cwagent.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [FILTER]
        Name                kubernetes
        Match               application.*
        Kube_URL            https://kubernetes.default.svc:443
        Kube_Tag_Prefix     application.var.log.containers.
        Merge_Log           On
        Merge_Log_Key       log_processed
        K8S-Logging.Parser  On
        K8S-Logging.Exclude Off
        Labels              Off
        Annotations         Off
        Use_Kubelet         On
        Kubelet_Port        10250
        Buffer_Size         0

    [OUTPUT]
        Name                cloudwatch_logs
        Match               application.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/application
        log_stream_prefix   ${HOST_NAME}-
        auto_create_group   true
        extra_user_agent    container-insights

  dataplane-log.conf: |
    [INPUT]
        Name                systemd
        Tag                 dataplane.systemd.*
        Systemd_Filter      _SYSTEMD_UNIT=docker.service
        Systemd_Filter      _SYSTEMD_UNIT=containerd.service
        Systemd_Filter      _SYSTEMD_UNIT=kubelet.service
        DB                  /var/fluent-bit/state/systemd.db
        Path                /var/log/journal
        Read_From_Tail      ${READ_FROM_TAIL}

    [INPUT]
        Name                tail
        Tag                 dataplane.tail.*
        Path                /var/log/containers/aws-node*, /var/log/containers/kube-proxy*
        multiline.parser    docker, cri
        DB                  /var/fluent-bit/state/flb_dataplane_tail.db
        Mem_Buf_Limit       50MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Rotate_Wait         30
        storage.type        filesystem
        Read_from_Head      ${READ_FROM_HEAD}

    [FILTER]
        Name                modify
        Match               dataplane.systemd.*
        Rename              _HOSTNAME                   hostname
        Rename              _SYSTEMD_UNIT               systemd_unit
        Rename              MESSAGE                     message
        Remove_regex        ^((?!hostname|systemd_unit|message).)*$

    [FILTER]
        Name                aws
        Match               dataplane.*
        imds_version        v2

    [OUTPUT]
        Name                cloudwatch_logs
        Match               dataplane.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/dataplane
        log_stream_prefix   ${HOST_NAME}-
        auto_create_group   true
        extra_user_agent    container-insights

  host-log.conf: |
    [INPUT]
        Name                tail
        Tag                 host.dmesg
        Path                /var/log/dmesg
        Key                 message
        DB                  /var/fluent-bit/state/flb_dmesg.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 host.messages
        Path                /var/log/messages
        Parser              syslog
        DB                  /var/fluent-bit/state/flb_messages.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 host.secure
        Path                /var/log/secure
        Parser              syslog
        DB                  /var/fluent-bit/state/flb_secure.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [FILTER]
        Name                aws
        Match               host.*
        imds_version        v2

    [OUTPUT]
        Name                cloudwatch_logs
        Match               host.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/host
        log_stream_prefix   ${HOST_NAME}.
        auto_create_group   true
        extra_user_agent    container-insights

  parsers.conf: |
    [PARSER]
        Name                syslog
        Format              regex
        Regex               ^(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
        Time_Key            time
        Time_Format         %b %d %H:%M:%S

    [PARSER]
        Name                container_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\S(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ

    [PARSER]
        Name                cwagent_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\d{4}[\/-]\d{1,2}[\/-]\d{1,2}[ T]\d{2}:\d{2}:\d{2}(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ
---
# Fluent Bit configuration for Windows nodes, mounted by the
# fluent-bit-windows DaemonSet.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-windows-config
  namespace: amazon-cloudwatch
  labels:
    k8s-app: fluent-bit
data:
  fluent-bit.conf: |
    [SERVICE]
        Flush                     5
        Log_Level                 error
        Daemon                    off
        net.dns.resolver          LEGACY
        Parsers_File              parsers.conf

    @INCLUDE application-log.conf
    @INCLUDE dataplane-log.conf
    @INCLUDE host-log.conf

  application-log.conf: |
    [INPUT]
        Name                tail
        Tag                 application.*
        Exclude_Path        C:\\var\\log\\containers\\fluent-bit*, C:\\var\\log\\containers\\cloudwatch-agent*
        Path                C:\\var\\log\\containers\\*.log
        Parser              docker
        DB                  C:\\var\\fluent-bit\\state\\flb_container.db
        Mem_Buf_Limit       50MB
        Skip_Long_Lines     On
        Rotate_Wait         30
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 application.*
        Path                C:\\var\\log\\containers\\fluent-bit*
        Parser              docker
        DB                  C:\\var\\fluent-bit\\state\\flb_log.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Rotate_Wait         30
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 application.*
        Path                C:\\var\\log\\containers\\cloudwatch-agent*
        Parser              docker
        DB                  C:\\var\\fluent-bit\\state\\flb_cwagent.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Rotate_Wait         30
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [OUTPUT]
        Name                cloudwatch_logs
        Match               application.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/application
        log_stream_prefix   ${HOST_NAME}-
        auto_create_group   true
        extra_user_agent    container-insights

  dataplane-log.conf: |
    [INPUT]
        Name                tail
        Tag                 dataplane.tail.*
        Path                C:\\ProgramData\\containerd\\root\\*.log, C:\\ProgramData\\Amazon\\EKS\\logs\\*.log
        Parser              dataplane_firstline
        DB                  C:\\var\\fluent-bit\\state\\flb_dataplane_tail.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Rotate_Wait         30
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [INPUT]
        Name                tail
        Tag                 dataplane.tail.C.ProgramData.Amazon.EKS.logs.vpc-bridge
        Path                C:\\ProgramData\\Amazon\\EKS\\logs\\*.log.*
        Path_Key            file_name
        Parser              dataplane_firstline
        DB                  C:\\var\\fluent-bit\\state\\flb_dataplane_cni_tail.db
        Mem_Buf_Limit       5MB
        Skip_Long_Lines     On
        Rotate_Wait         30
        Refresh_Interval    10
        Read_from_Head      ${READ_FROM_HEAD}

    [FILTER]
        Name                aws
        Match               dataplane.*
        imds_version        v2

    [OUTPUT]
        Name                cloudwatch_logs
        Match               dataplane.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/dataplane
        log_stream_prefix   ${HOST_NAME}-
        auto_create_group   true
        extra_user_agent    container-insights

  host-log.conf: |
    [INPUT]
        Name                winlog
        Channels            EKS, System
        DB                  C:\\var\\fluent-bit\\state\\flb_system_winlog.db
        Interval_Sec        60

    [FILTER]
        Name                aws
        Match               winlog.*
        imds_version        v2

    [OUTPUT]
        Name                cloudwatch_logs
        Match               winlog.*
        region              ${AWS_REGION}
        log_group_name      /aws/containerinsights/${CLUSTER_NAME}/host
        log_stream_prefix   ${HOST_NAME}.
        auto_create_group   true
        extra_user_agent    container-insights

  # NOTE(review): the "docker" parser below uses Time_Format %b %d %H:%M:%S
  # while the two regex parsers use an ISO-8601 format — confirm this is
  # intended for JSON container logs.
  parsers.conf: |
    [PARSER]
        Name                docker
        Format              json
        Time_Key            time
        Time_Format         %b %d %H:%M:%S

    [PARSER]
        Name                container_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\S(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ

    [PARSER]
        Name                dataplane_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\S(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ
---
# Cluster-wide permissions bound to the cloudwatch-agent service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: cloudwatch-agent-role
rules:
  # NOTE(review): the Kubernetes pod log subresource is named "pods/log";
  # "pods/logs" is kept unchanged here — confirm which one is intended.
  - apiGroups: [""]
    resources: ["pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints"]
    verbs: ["list", "watch", "get"]
  - apiGroups: [""]
    resources: ["services"]
    verbs: ["list", "watch"]
  - apiGroups: ["apps"]
    resources: ["replicasets", "daemonsets", "deployments", "statefulsets"]
    verbs: ["list", "watch", "get"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["list", "watch"]
  - apiGroups: [""]
    resources: ["nodes/stats", "configmaps", "events"]
    verbs: ["create", "get"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["update"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get", "list", "watch"]
---
# Cluster-wide permissions bound to the operator's controller-manager
# service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: amazon-cloudwatch-observability-manager-role
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "patch"]
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "list", "patch", "update", "watch"]
  - apiGroups: [""]
    resources: ["serviceaccounts"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: [""]
    resources: ["services"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: ["apps"]
    resources: ["daemonsets"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: ["apps"]
    resources: ["statefulsets"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: ["apps"]
    resources: ["replicasets"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["cloudwatch.aws.amazon.com"]
    resources: ["amazoncloudwatchagents", "dcgmexporters", "neuronmonitors"]
    verbs: ["get", "list", "patch", "update", "watch"]
  - apiGroups: ["cloudwatch.aws.amazon.com"]
    resources: ["amazoncloudwatchagents/finalizers", "dcgmexporters/finalizers", "neuronmonitors/finalizers"]
    verbs: ["get", "patch", "update"]
  - apiGroups: ["cloudwatch.aws.amazon.com"]
    resources: ["amazoncloudwatchagents/status", "dcgmexporters/status", "neuronmonitors/status"]
    verbs: ["get", "patch", "update"]
  - apiGroups: ["cloudwatch.aws.amazon.com"]
    resources: ["instrumentations"]
    verbs: ["get", "list", "patch", "update", "watch"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["create", "get", "list", "update"]
  - apiGroups: ["networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
  - apiGroups: ["route.openshift.io"]
    resources: ["routes", "routes/custom-host"]
    verbs: ["create", "delete", "get", "list", "patch", "update", "watch"]
---
# Grants cloudwatch-agent-role to the cloudwatch-agent service account.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cloudwatch-agent-role-binding
roleRef:
  kind: ClusterRole
  name: cloudwatch-agent-role
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    name: cloudwatch-agent
    namespace: amazon-cloudwatch
---
# Grants the manager role to the controller-manager service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-manager-rolebinding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: amazon-cloudwatch-observability-manager-role
subjects:
  - kind: ServiceAccount
    name: amazon-cloudwatch-observability-controller-manager
    namespace: amazon-cloudwatch
---
# Read-only access to the single ConfigMap named dcgm-exporter-config-map.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: "dcgm-exporter-role"
  namespace: amazon-cloudwatch
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["dcgm-exporter-config-map"]
    verbs: ["get"]
---
# Read-only access to the single ConfigMap named neuron-monitor-config-map.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: "neuron-monitor-role"
  namespace: amazon-cloudwatch
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
rules:
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["neuron-monitor-config-map"]
    verbs: ["get"]
---
# Binds dcgm-exporter-role to the dcgm-exporter-service-acct service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: amazon-cloudwatch
  name: dcgm-exporter-role-binding
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
roleRef:
  kind: Role
  name: "dcgm-exporter-role"
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    name: dcgm-exporter-service-acct
    namespace: amazon-cloudwatch
---
# Binds neuron-monitor-role to the neuron-monitor-service-acct service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: amazon-cloudwatch
  name: neuron-monitor-role-binding
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
roleRef:
  kind: Role
  name: "neuron-monitor-role"
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    name: neuron-monitor-service-acct
    namespace: amazon-cloudwatch
---
# Exposes the controller-manager's webhook port (9443) on service port 443.
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-webhook-service
  namespace: amazon-cloudwatch
spec:
  ports:
    - port: 443
      protocol: TCP
      targetPort: 9443
  selector:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    control-plane: controller-manager
---
# Fluent Bit log forwarder for Linux nodes; reads its configuration from the
# fluent-bit-config ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluent-bit
  namespace: amazon-cloudwatch
  labels:
    k8s-app: fluent-bit
    version: v1
    kubernetes.io/cluster-service: "true"
spec:
  selector:
    matchLabels:
      k8s-app: fluent-bit
  template:
    metadata:
      annotations:
        checksum/config: "1356b1d704d353a90c127f6dad453991f51d88ae994a7583c1064e0c883d898e"
      labels:
        k8s-app: fluent-bit
        version: v1
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
        - name: fluent-bit
          image: public.ecr.aws/aws-observability/aws-for-fluent-bit:2.32.0.20240304
          imagePullPolicy: Always
          env:
            # Region and cluster name are template placeholders that must be
            # substituted before applying. Both are quoted so the manifest
            # stays valid YAML and the values are always strings.
            - name: AWS_REGION
              value: "{{region_name}}"
            - name: CLUSTER_NAME
              value: "{{cluster_name}}"
            - name: READ_FROM_HEAD
              value: "Off"
            - name: READ_FROM_TAIL
              value: "On"
            - name: HOST_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.name
            - name: CI_VERSION
              value: "k8s/1.3.31"
          resources:
            limits:
              cpu: 500m
              memory: 250Mi
            requests:
              cpu: 50m
              memory: 25Mi
          volumeMounts:
            # Please don't change below read-only permissions
            - name: fluentbitstate
              mountPath: /var/fluent-bit/state
            - name: varlog
              mountPath: /var/log
              readOnly: true
            - name: varlibdockercontainers
              mountPath: /var/lib/docker/containers
              readOnly: true
            - name: fluent-bit-config
              mountPath: /fluent-bit/etc/
            - name: runlogjournal
              mountPath: /run/log/journal
              readOnly: true
            - name: dmesg
              mountPath: /var/log/dmesg
              readOnly: true
      # NOTE(review): the [SERVICE] section of fluent-bit-config sets
      # Grace 30, which is longer than this 10 second termination period —
      # confirm the shorter pod grace period is intended.
      terminationGracePeriodSeconds: 10
      hostNetwork: true
      dnsPolicy: ClusterFirstWithHostNet
      volumes:
        - name: fluentbitstate
          hostPath:
            path: /var/fluent-bit/state
        - name: varlog
          hostPath:
            path: /var/log
        - name: varlibdockercontainers
          hostPath:
            path: /var/lib/docker/containers
        - name: fluent-bit-config
          configMap:
            name: fluent-bit-config
        - name: runlogjournal
          hostPath:
            path: /run/log/journal
        - name: dmesg
          hostPath:
            path: /var/log/dmesg
      serviceAccountName: cloudwatch-agent
      nodeSelector:
        kubernetes.io/os: linux
---
# Fluent Bit log forwarder for Windows nodes, run as a host-process
# container; reads its configuration from fluent-bit-windows-config.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluent-bit-windows
  namespace: amazon-cloudwatch
  labels:
    k8s-app: fluent-bit
    version: v1
    kubernetes.io/cluster-service: "true"
spec:
  selector:
    matchLabels:
      k8s-app: fluent-bit
  template:
    metadata:
      annotations:
        checksum/config: "a54dc0c777b3caf8ea8c5e895f9e6054af9b06c72bed9d012c4414165bc85a41"
      labels:
        k8s-app: fluent-bit
        version: v1
        kubernetes.io/cluster-service: "true"
    spec:
      securityContext:
        windowsOptions:
          hostProcess: true
          runAsUserName: "NT AUTHORITY\\System"
      hostNetwork: true
      nodeSelector:
        kubernetes.io/os: windows
      containers:
        - name: fluent-bit
          image: public.ecr.aws/aws-observability/aws-for-fluent-bit:2.31.12-windowsservercore
          imagePullPolicy: Always
          # Creates the state directory, then starts fluent-bit.exe with the
          # three AWS output plugins and the mounted configuration file.
          command:
            - "powershell.exe"
            - "-Command"
            - "New-Item -ItemType Directory -Path C:\\var\\fluent-bit\\state -Force;"
            - "%CONTAINER_SANDBOX_MOUNT_POINT%/fluent-bit/bin/fluent-bit.exe"
            - "-e"
            - "%CONTAINER_SANDBOX_MOUNT_POINT%/fluent-bit/kinesis.dll"
            - "-e"
            - "%CONTAINER_SANDBOX_MOUNT_POINT%/fluent-bit/firehose.dll"
            - "-e"
            - "%CONTAINER_SANDBOX_MOUNT_POINT%/fluent-bit/cloudwatch.dll"
            - "-c"
            - "%CONTAINER_SANDBOX_MOUNT_POINT%/fluent-bit/configuration/fluent-bit.conf"
          env:
            # Template placeholders; quoted so the manifest stays valid YAML
            # and the values are always strings.
            - name: AWS_REGION
              value: "{{region_name}}"
            - name: CLUSTER_NAME
              value: "{{cluster_name}}"
            - name: READ_FROM_HEAD
              value: "Off"
            - name: HOST_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.name
            - name: CI_VERSION
              value: "k8s/1.3.31"
          resources:
            limits:
              cpu: 500m
              memory: 600Mi
            requests:
              cpu: 300m
              memory: 300Mi
          volumeMounts:
            - name: fluent-bit-config
              mountPath: fluent-bit\configuration\
      volumes:
        - name: fluent-bit-config
          configMap:
            name: fluent-bit-windows-config
      terminationGracePeriodSeconds: 10
      dnsPolicy: ClusterFirstWithHostNet
      serviceAccountName: cloudwatch-agent
---
# Operator (controller-manager) Deployment; serves the admission webhooks on
# container port 9443 using the mounted serving certificate.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
    control-plane: controller-manager
  name: amazon-cloudwatch-observability-controller-manager
  namespace: amazon-cloudwatch
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: amazon-cloudwatch-observability
      control-plane: controller-manager
  template:
    metadata:
      # Explicit empty map instead of a bare key (which parses as null).
      annotations: {}
      labels:
        app.kubernetes.io/name: amazon-cloudwatch-observability
        control-plane: controller-manager
    spec:
      containers:
        - image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent-operator:1.3.0
          args:
            - "--auto-annotation-config={\"java\":{\"daemonsets\":[],\"deployments\":[],\"namespaces\":[],\"statefulsets\":[]},\"python\":{\"daemonsets\":[],\"deployments\":[],\"namespaces\":[],\"statefulsets\":[]}}"
            - "--auto-instrumentation-java-image=public.ecr.aws/aws-observability/adot-autoinstrumentation-java:v1.32.1"
            - "--auto-instrumentation-python-image=public.ecr.aws/aws-observability/adot-autoinstrumentation-python:v0.1.0"
            - "--feature-gates=operator.autoinstrumentation.multi-instrumentation,operator.autoinstrumentation.multi-instrumentation.skip-container-validation"
          command:
            - /manager
          name: manager
          ports:
            - containerPort: 9443
              name: webhook-server
              protocol: TCP
          resources:
            requests:
              cpu: 100m
              memory: 64Mi
          volumeMounts:
            - mountPath: /tmp/k8s-webhook-server/serving-certs
              name: cert
              readOnly: true
      serviceAccountName: amazon-cloudwatch-observability-controller-manager
      terminationGracePeriodSeconds: 10
      volumes:
        - name: cert
          secret:
            defaultMode: 420
            secretName: amazon-cloudwatch-observability-controller-manager-service-cert
      nodeSelector:
        kubernetes.io/os: linux
---
# CloudWatch agent for Linux nodes, declared as an operator custom resource
# in daemonset mode.
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: AmazonCloudWatchAgent
metadata:
  name: cloudwatch-agent
  namespace: amazon-cloudwatch
spec:
  image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent:1.300052.0b1024
  mode: daemonset
  nodeSelector:
    kubernetes.io/os: linux
  serviceAccount: cloudwatch-agent
  # Agent configuration as a JSON string; contains the region and cluster
  # name template placeholders.
  config: "{\"agent\":{\"region\":\"{{region_name}}\"},\"logs\":{\"metrics_collected\":{\"kubernetes\":{\"cluster_name\":\"{{cluster_name}}\",\"enhanced_container_insights\":true}}}}"
  resources:
    requests:
      memory: "128Mi"
      cpu: "250m"
    limits:
      memory: "512Mi"
      cpu: "500m"
  volumeMounts:
    - mountPath: /rootfs
      name: rootfs
      readOnly: true
    - mountPath: /var/run/docker.sock
      name: dockersock
      readOnly: true
    - mountPath: /run/containerd/containerd.sock
      name: containerdsock
    - mountPath: /var/lib/docker
      name: varlibdocker
      readOnly: true
    - mountPath: /sys
      name: sys
      readOnly: true
    - mountPath: /dev/disk
      name: devdisk
      readOnly: true
    - mountPath: /etc/amazon-cloudwatch-observability-agent-cert
      name: agenttls
      readOnly: true
    - mountPath: /var/lib/kubelet/pod-resources
      name: kubelet-podresources
  volumes:
    - name: kubelet-podresources
      hostPath:
        path: /var/lib/kubelet/pod-resources
        type: Directory
    - name: rootfs
      hostPath:
        path: /
    - hostPath:
        path: /var/run/docker.sock
      name: dockersock
    - hostPath:
        path: /var/lib/docker
      name: varlibdocker
    - hostPath:
        path: /run/containerd/containerd.sock
      name: containerdsock
    - hostPath:
        path: /sys
      name: sys
    - hostPath:
        path: /dev/disk/
      name: devdisk
    # Only the CA certificate from the agent cert secret is projected.
    - name: agenttls
      secret:
        secretName: amazon-cloudwatch-observability-agent-cert
        items:
          - key: ca.crt
            path: tls-ca.crt
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: HOST_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: HOST_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: K8S_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
---
# CloudWatch agent for Windows nodes, run as a host-process container.
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: AmazonCloudWatchAgent
metadata:
  name: cloudwatch-agent-windows
  namespace: amazon-cloudwatch
spec:
  podSecurityContext:
    windowsOptions:
      hostProcess: true
      runAsUserName: "NT AUTHORITY\\System"
  hostNetwork: true
  image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent:1.300052.0b1024
  mode: daemonset
  serviceAccount: cloudwatch-agent
  nodeSelector:
    kubernetes.io/os: windows
  config: "{\"logs\":{\"metrics_collected\":{\"kubernetes\":{\"enhanced_container_insights\":true}}}}"
  resources:
    requests:
      memory: "128Mi"
      cpu: "250m"
    limits:
      memory: "512Mi"
      cpu: "500m"
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: HOST_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: HOST_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: K8S_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
    - name: RUN_IN_CONTAINER
      value: "True"
    - name: RUN_AS_HOST_PROCESS_CONTAINER
      value: "True"
---
# Serving certificate for the operator's webhook service; written to the
# secret that the controller-manager mounts.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-serving-cert
  namespace: amazon-cloudwatch
spec:
  dnsNames:
    - amazon-cloudwatch-observability-webhook-service.amazon-cloudwatch
    - amazon-cloudwatch-observability-webhook-service.amazon-cloudwatch.svc
    - amazon-cloudwatch-observability-webhook-service.amazon-cloudwatch.svc.cluster.local
  issuerRef:
    kind: Issuer
    name: amazon-cloudwatch-observability-selfsigned-issuer
  secretName: amazon-cloudwatch-observability-controller-manager-service-cert
  subject:
    organizationalUnits:
      - amazon-cloudwatch-observability
---
# Certificate for the DCGM exporter and Neuron monitor service names, issued
# by the "agent-ca" Issuer into the agent cert secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: "amazon-cloudwatch-observability-agent-cert"
  namespace: amazon-cloudwatch
spec:
  dnsNames:
    - "dcgm-exporter-service"
    - "dcgm-exporter-service.amazon-cloudwatch.svc"
    - "neuron-monitor-service"
    - "neuron-monitor-service.amazon-cloudwatch.svc"
  issuerRef:
    kind: Issuer
    name: "agent-ca"
  secretName: "amazon-cloudwatch-observability-agent-cert"
---
# NVIDIA DCGM exporter custom resource, restricted by node affinity to the
# listed GPU instance types.
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: DcgmExporter
metadata:
  name: dcgm-exporter
  namespace: amazon-cloudwatch
  labels:
    k8s-app: dcgm-exporter
    version: v1
spec:
  image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.3-3.3.1-ubuntu22.04
  nodeSelector:
    kubernetes.io/os: linux
  serviceAccount: dcgm-exporter-service-acct
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - p2.xlarge
                  - p2.8xlarge
                  - p2.16xlarge
                  - p3.2xlarge
                  - p3.8xlarge
                  - p3.16xlarge
                  - p3dn.24xlarge
                  - p4d.24xlarge
                  - p4de.24xlarge
                  - p5.48xlarge
                  - g3s.xlarge
                  - g3.4xlarge
                  - g3.8xlarge
                  - g3.16xlarge
                  - g4dn.xlarge
                  - g4dn.2xlarge
                  - g4dn.4xlarge
                  - g4dn.8xlarge
                  - g4dn.16xlarge
                  - g4dn.12xlarge
                  - g4dn.metal
                  - g4ad.xlarge
                  - g4ad.2xlarge
                  - g4ad.4xlarge
                  - g4ad.8xlarge
                  - g4ad.16xlarge
                  - g5.xlarge
                  - g5.2xlarge
                  - g5.4xlarge
                  - g5.8xlarge
                  - g5.16xlarge
                  - g5.12xlarge
                  - g5.24xlarge
                  - g5.48xlarge
                  - g5g.xlarge
                  - g5g.2xlarge
                  - g5g.4xlarge
                  - g5g.8xlarge
                  - g5g.16xlarge
                  - g5g.metal
  resources:
    requests:
      cpu: 250m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 250Mi
  env:
    - name: "DCGM_EXPORTER_KUBERNETES"
      value: "true"
    - name: "DCGM_EXPORTER_LISTEN"
      value: ":9400"
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
  ports:
    - name: "metrics"
      port: 9400
  volumeMounts:
    - name: "pod-gpu-resources"
      readOnly: true
      mountPath: "/var/lib/kubelet/pod-resources"
    - mountPath: /etc/amazon-cloudwatch-observability-dcgm-cert
      name: dcgmtls
      readOnly: true
  volumes:
    # tls.crt / tls.key from the agent cert secret are projected as
    # server.crt / server.key, the paths referenced by tlsConfig below.
    - name: dcgmtls
      secret:
        secretName: amazon-cloudwatch-observability-agent-cert
        items:
          - key: tls.crt
            path: server.crt
          - key: tls.key
            path: server.key
    - name: "pod-gpu-resources"
      hostPath:
        path: /var/lib/kubelet/pod-resources
  # One metric per line: DCGM field, metric type, help text.
  # The DCGM_FI_DEV_FB_TOTAL help text previously duplicated the "used"
  # description of DCGM_FI_DEV_FB_USED; it now describes the total.
  metricsConfig: |
    DCGM_FI_DEV_GPU_UTIL,      gauge, GPU utilization (in %).
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
    DCGM_FI_DEV_FB_TOTAL, gauge, Framebuffer memory total (in MiB).
    DCGM_FI_DEV_FB_USED_PERCENT, gauge, Percentage used of Frame Buffer: Used/(Total - Reserved).
    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
    DCGM_FI_DEV_GPU_TEMP,    gauge, GPU temperature (in C).
    DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
  tlsConfig: |
    tls_server_config:
      cert_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.crt
      key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
---
# Self-signed issuer referenced by the webhook serving certificate.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-selfsigned-issuer
  namespace: amazon-cloudwatch
spec:
  selfSigned: {}
---
# Self-signed issuer referenced by the agent certificate.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: "agent-ca"
  namespace: amazon-cloudwatch
spec:
  selfSigned: {}
---
# Mutating admission webhooks served by the operator's webhook service.
# Every webhook uses failurePolicy Ignore; the CA bundle is injected by
# cert-manager from the serving certificate named in the annotation.
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
  annotations:
    cert-manager.io/inject-ca-from: amazon-cloudwatch/amazon-cloudwatch-observability-serving-cert
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-mutating-webhook-configuration
webhooks:
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /mutate-cloudwatch-aws-amazon-com-v1alpha1-instrumentation
    failurePolicy: Ignore
    name: minstrumentation.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - CREATE
          - UPDATE
        resources:
          - instrumentations
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /mutate-cloudwatch-aws-amazon-com-v1alpha1-amazoncloudwatchagent
    failurePolicy: Ignore
    name: mamazoncloudwatchagent.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - CREATE
          - UPDATE
        resources:
          - amazoncloudwatchagents
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /mutate-v1-pod
    failurePolicy: Ignore
    name: mpod.kb.io
    rules:
      - apiGroups:
          - ""
        apiVersions:
          - v1
        operations:
          - CREATE
          - UPDATE
        resources:
          - pods
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /mutate-v1-namespace
    failurePolicy: Ignore
    name: mnamespace.kb.io
    rules:
      - apiGroups:
          - ""
        apiVersions:
          - v1
        operations:
          - CREATE
          - UPDATE
        resources:
          - namespaces
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /mutate-v1-workload
    failurePolicy: Ignore
    name: mworkload.kb.io
    rules:
      - apiGroups:
          - apps
        apiVersions:
          - v1
        operations:
          - CREATE
          - UPDATE
        resources:
          - daemonsets
          - deployments
          - statefulsets
    sideEffects: None
    timeoutSeconds: 10
---
# AWS Neuron monitor custom resource, restricted by node affinity to Linux
# nodes of the listed Trainium/Inferentia instance types.
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: NeuronMonitor
metadata:
  name: neuron-monitor
  namespace: amazon-cloudwatch
  labels:
    k8s-app: neuron-monitor
    version: v1
spec:
  image: public.ecr.aws/neuron/neuron-monitor:1.0.0
  serviceAccount: neuron-monitor-service-acct
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: kubernetes.io/os
                operator: In
                values:
                  - linux
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - trn1.2xlarge
                  - trn1.32xlarge
                  - trn1n.32xlarge
                  - inf1.xlarge
                  - inf1.2xlarge
                  - inf1.6xlarge
                  - inf1.24xlarge
                  - inf2.xlarge
                  - inf2.8xlarge
                  - inf2.24xlarge
                  - inf2.48xlarge
  resources:
    limits:
      cpu: 500m
      memory: 256Mi
    requests:
      cpu: 256m
      memory: 128Mi
  env:
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: PATH
      value: /usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin
  ports:
    - name: "metrics"
      port: 8000
  command:
    - "/opt/bin/entrypoint.sh"
  # args is a mapping of option name to value in this resource.
  args:
    port: "8000"
    cert-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.crt"
    key-file: "/etc/amazon-cloudwatch-observability-neuron-cert/server.key"
  securityContext:
    privileged: true
  volumeMounts:
    - mountPath: /etc/amazon-cloudwatch-observability-neuron-cert/
      name: neurontls
      readOnly: true
  volumes:
    # tls.crt / tls.key from the agent cert secret are projected as
    # server.crt / server.key, the paths passed in args above.
    - name: neurontls
      secret:
        secretName: amazon-cloudwatch-observability-agent-cert
        items:
          - key: tls.crt
            path: server.crt
          - key: tls.key
            path: server.key
  monitorConfig: |
    {
      "period": "5s",
      "neuron_runtimes": [
        {
          "tag_filter": ".*",
          "metrics": [
            {
              "type": "neuroncore_counters"
            },
            {
              "type": "memory_used"
            },
            {
              "type": "neuron_runtime_vcpu_usage"
            },
            {
              "type": "execution_stats"
            }
          ]
        }
      ],
      "system_metrics": [
        {
          "type": "memory_info"
        },
        {
          "period": "5s",
          "type": "neuron_hw_counters"
        }
      ]
    }
---
# Validating admission webhooks served by the operator's webhook service.
# Create/update and delete are registered as separate webhooks per resource;
# every webhook uses failurePolicy Ignore.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
  annotations:
    cert-manager.io/inject-ca-from: amazon-cloudwatch/amazon-cloudwatch-observability-serving-cert
  labels:
    app.kubernetes.io/name: amazon-cloudwatch-observability
    app.kubernetes.io/instance: amazon-cloudwatch-observability
    app.kubernetes.io/version: "1.0.0"
    app.kubernetes.io/managed-by: "amazon-cloudwatch-agent-operator"
  name: amazon-cloudwatch-observability-validating-webhook-configuration
webhooks:
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /validate-cloudwatch-aws-amazon-com-v1alpha1-instrumentation
    failurePolicy: Ignore
    name: vinstrumentationcreateupdate.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - CREATE
          - UPDATE
        resources:
          - instrumentations
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /validate-cloudwatch-aws-amazon-com-v1alpha1-instrumentation
    failurePolicy: Ignore
    name: vinstrumentationdelete.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - DELETE
        resources:
          - instrumentations
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /validate-cloudwatch-aws-amazon-com-v1alpha1-amazoncloudwatchagent
    failurePolicy: Ignore
    name: vamazoncloudwatchagentcreateupdate.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - CREATE
          - UPDATE
        resources:
          - amazoncloudwatchagents
    sideEffects: None
    timeoutSeconds: 10
  - admissionReviewVersions:
      - v1
    clientConfig:
      service:
        name: amazon-cloudwatch-observability-webhook-service
        namespace: amazon-cloudwatch
        path: /validate-cloudwatch-aws-amazon-com-v1alpha1-amazoncloudwatchagent
    failurePolicy: Ignore
    name: vamazoncloudwatchagentdelete.kb.io
    rules:
      - apiGroups:
          - cloudwatch.aws.amazon.com
        apiVersions:
          - v1alpha1
        operations:
          - DELETE
        resources:
          - amazoncloudwatchagents
    sideEffects: None
    timeoutSeconds: 10