---
name: senior-devops
description: Expert DevOps engineering covering CI/CD pipelines, infrastructure as code, container orchestration, cloud platforms, and site reliability.
version: 1.0.0
author: Claude Skills
category: engineering
tags: [devops, cicd, kubernetes, terraform, aws, docker]
---

# Senior DevOps Engineer

Expert-level DevOps and platform engineering.

## Core Competencies

- CI/CD pipeline design
- Infrastructure as Code (IaC)
- Container orchestration
- Cloud platform management
- Monitoring and alerting
- Security and compliance
- Cost optimization
- Incident management

## CI/CD Pipelines

### GitHub Actions

**Standard Pipeline:**

```yaml
name: CI/CD Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

env:
  NODE_VERSION: '20'
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Run linter
        run: npm run lint

      - name: Run tests
        run: npm test -- --coverage

      - name: Upload coverage
        uses: codecov/codecov-action@v3

  build:
    needs: test
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: .
          # Pull requests only build; images are pushed on branch pushes.
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max

  deploy:
    needs: build
    if: github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4

      # azure/k8s-deploy expects the cluster context to be set by an
      # earlier step. KUBE_CONFIG is a placeholder secret name holding the
      # kubeconfig contents; adapt it to your repository.
      - name: Set Kubernetes context
        uses: azure/k8s-set-context@v4
        with:
          method: kubeconfig
          kubeconfig: ${{ secrets.KUBE_CONFIG }}

      - name: Deploy to Kubernetes
        uses: azure/k8s-deploy@v4
        with:
          manifests: k8s/production/
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
```

### GitLab CI

```yaml
stages:
  - test
  - build
  - deploy

variables:
  DOCKER_TLS_CERTDIR: "/certs"

test:
  stage: test
  image: node:20
  cache:
    # `npm ci` deletes node_modules/ before installing, so cache npm's
    # download cache instead of the installed tree.
    paths:
      - .npm/
  script:
    - npm ci --cache .npm --prefer-offline
    - npm run lint
    - npm test

build:
  stage: build
  image: docker:24
  services:
    - docker:24-dind
  script:
    # Pass the password on stdin so it does not appear in the process list.
    - echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
    - docker build -t $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA .
    - docker push $CI_REGISTRY_IMAGE:$CI_COMMIT_SHA

deploy:production:
  stage: deploy
  image:
    name: bitnami/kubectl:latest
    # The image's entrypoint is `kubectl`; clear it so the runner can
    # start a shell and execute the script lines.
    entrypoint: [""]
  only:
    - main
  script:
    - kubectl set image deployment/app app=$CI_REGISTRY_IMAGE:$CI_COMMIT_SHA
```

## Infrastructure as Code

### Terraform

**AWS EKS Cluster:**

```hcl
terraform {
  required_version = ">= 1.0"

  backend "s3" {
    bucket         = "terraform-state-bucket"
    key            = "eks/terraform.tfstate"
    region         = "us-east-1"
    dynamodb_table = "terraform-locks"
    encrypt        = true
  }

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }
}

provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      Environment = var.environment
      ManagedBy   = "terraform"
      Project     = var.project_name
    }
  }
}

module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = "${var.project_name}-vpc"
  cidr = var.vpc_cidr

  azs             = var.availability_zones
  private_subnets = var.private_subnet_cidrs
  public_subnets  = var.public_subnet_cidrs

  enable_nat_gateway   = true
  # One shared NAT gateway outside production; one per AZ in production.
  single_nat_gateway   = var.environment != "production"
  enable_dns_hostnames = true

  public_subnet_tags = {
    "kubernetes.io/role/elb" = 1
  }

  private_subnet_tags = {
    "kubernetes.io/role/internal-elb" = 1
  }
}

module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "${var.project_name}-eks"
  cluster_version = "1.28"

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  cluster_endpoint_public_access = true

  eks_managed_node_groups = {
    main = {
      min_size     = var.node_min_size
      max_size     = var.node_max_size
      desired_size = var.node_desired_size

      instance_types = var.node_instance_types
      capacity_type  = "ON_DEMAND"
    }
  }
}
```

**Variables:**

```hcl
variable "aws_region" {
  type    = string
  default = "us-east-1"
}

variable "environment" {
  type = string
}

variable "project_name" {
  type = string
}

variable "vpc_cidr" {
  type    = string
  default = "10.0.0.0/16"
}

variable "availability_zones" {
  type    = list(string)
  default = ["us-east-1a", "us-east-1b", "us-east-1c"]
}

variable "private_subnet_cidrs" {
  type    = list(string)
  default = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
}

variable "public_subnet_cidrs" {
  type    = list(string)
  default = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"]
}

variable "node_min_size" {
  type    = number
  default = 2
}

variable "node_max_size" {
  type    = number
  default = 10
}

variable "node_desired_size" {
  type    = number
  default = 3
}

variable "node_instance_types" {
  type    = list(string)
  default = ["t3.medium"]
}
```

## Kubernetes

### Deployment Manifest

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api
  labels:
    app: api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: api
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: api
    spec:
      serviceAccountName: api
      containers:
        - name: api
          image: ghcr.io/org/api:latest
          ports:
            - containerPort: 3000
          env:
            - name: NODE_ENV
              value: production
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: api-secrets
                  key: database-url
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
          securityContext:
            runAsNonRoot: true
            readOnlyRootFilesystem: true
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: api
                topologyKey: kubernetes.io/hostname
---
apiVersion: v1
kind: Service
metadata:
  name: api
spec:
  selector:
    app: api
  ports:
    - port: 80
      targetPort: 3000
  type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: api
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api
  minReplicas: 3
  maxReplicas: 20
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
```

### Helm Chart Structure

```
chart/
├── Chart.yaml
├── values.yaml
├── values-staging.yaml
├── values-production.yaml
├── templates/
│   ├── _helpers.tpl
│   ├── deployment.yaml
│   ├── service.yaml
│   ├── ingress.yaml
│   ├── hpa.yaml
│   ├── configmap.yaml
│   ├── secret.yaml
│   └── serviceaccount.yaml
```

**values.yaml:**

```yaml
replicaCount: 3

image:
  repository: ghcr.io/org/api
  tag: latest
  pullPolicy: IfNotPresent

service:
  type: ClusterIP
  port: 80

ingress:
  enabled: true
  className: nginx
  hosts:
    - host: api.example.com
      paths:
        - path: /
          pathType: Prefix

resources:
  requests:
    cpu: 100m
    memory: 256Mi
  limits:
    cpu: 500m
    memory: 512Mi

autoscaling:
  enabled: true
  minReplicas: 3
  maxReplicas: 20
  targetCPUUtilizationPercentage: 70

env:
  NODE_ENV: production
```

## Docker

### Multi-Stage Dockerfile

```dockerfile
# Build stage
FROM node:20-alpine AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build
# Drop devDependencies so the node_modules copied into the production
# stage contains runtime dependencies only.
RUN npm prune --omit=dev

# Production stage
FROM node:20-alpine AS production
WORKDIR /app
RUN addgroup -g 1001 -S nodejs && \
    adduser -S nodejs -u 1001
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./
USER nodejs
EXPOSE 3000
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
  CMD wget --no-verbose --tries=1 --spider http://localhost:3000/health || exit 1
CMD ["node", "dist/main.js"]
```

### Docker Compose

```yaml
# The top-level `version` key is obsolete in the Compose Specification
# and is intentionally omitted.
services:
  api:
    build:
      context: .
      target: production
    ports:
      - "3000:3000"
    environment:
      - DATABASE_URL=postgresql://user:pass@db:5432/app
      - REDIS_URL=redis://redis:6379
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_started
    healthcheck:
      test: ["CMD", "wget", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  db:
    image: postgres:15-alpine
    environment:
      POSTGRES_USER: user
      POSTGRES_PASSWORD: pass
      POSTGRES_DB: app
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U user -d app"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    volumes:
      - redis_data:/data

volumes:
  postgres_data:
  redis_data:
```

## Monitoring

### Prometheus Configuration

```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
```

### Alerting Rules

```yaml
groups:
  - name: api-alerts
    rules:
      - alert: HighErrorRate
        # Aggregate both sides before dividing: without sum() the two
        # vectors are matched on identical label sets, so each 5xx series
        # is divided by itself instead of by total traffic.
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is {{ $value | humanizePercentage }}

      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High latency detected
          description: p95 latency is {{ $value | humanizeDuration }}

      - alert: PodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Pod is crash looping
          description: Pod {{ $labels.pod }} is restarting frequently
```

## Reference Materials

- `references/cicd_patterns.md` - CI/CD best practices
- `references/kubernetes_guide.md` - K8s deployment patterns
- `references/terraform_modules.md` - IaC module library
- `references/monitoring_setup.md` - Observability guide

## Scripts

```bash
# Pipeline generator
python scripts/pipeline_gen.py --type github-actions --lang node

# Infrastructure scaffolder
python scripts/infra_scaffold.py --provider aws --service eks

# Deployment manager
python scripts/deploy.py --env production --version v1.2.3

# Cost analyzer
python scripts/cost_analyzer.py --account production
```