# claude-code/workflows/health/cluster-health-check.yaml

# Workflow identity.
# `version` is quoted so it stays the string 1.0 instead of being typed as a
# float by the YAML parser.
name: cluster-health-check
description: Comprehensive cluster health assessment
version: '1.0'

# Ways this workflow can be started.
trigger:
  # Cron expression: minute 0 of every 6th hour (00:00, 06:00, 12:00, 18:00).
  # NOTE(review): the timezone is whatever the runner uses — confirm.
  - schedule: '0 */6 * * *'
  # Presumably enables on-demand runs in addition to the schedule — confirm
  # against the runner's trigger schema.
  - manual: true

# Workflow-wide default settings.
# Presumably `model` is the fallback for steps that omit their own `model`
# key (TODO confirm); every step currently visible in this file sets one
# explicitly.
defaults:
  model: sonnet

steps:
  # Node-level health probe. Its result is named `node_status`, is read by the
  # analyze-and-report step (via steps.check-nodes.output) and is listed in the
  # top-level `outputs`.
  - name: check-nodes
    agent: k8s-diagnostician
    model: haiku  # set explicitly per step; defaults.model is sonnet
    # Literal block scalar (|): line breaks in the prompt text are preserved.
    task: |
      Get node status for all nodes:
      - Check node conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
      - Report any nodes not in Ready state
      - Check resource usage with kubectl top nodes
    output: node_status

  # Pod-level health probe across all namespaces. Its result is named
  # `pod_status`, is read by the analyze-and-report step (via
  # steps.check-pods.output) and is listed in the top-level `outputs`.
  # NOTE(review): CrashLoopBackOff is a container waiting reason rather than a
  # pod phase, and kubectl reports cumulative restart counts, so the
  # "in last hour" window may need another data source — confirm the agent can
  # derive both as worded.
  - name: check-pods
    agent: k8s-diagnostician
    model: haiku  # set explicitly per step; defaults.model is sonnet
    # Literal block scalar (|): line breaks in the prompt text are preserved.
    task: |
      Get pod status across all namespaces:
      - Count pods by status (Running, Pending, Failed, CrashLoopBackOff)
      - List any unhealthy pods with their namespace and reason
      - Check for high restart counts (>5 in last hour)
    output: pod_status

  # Metrics probe delegated to the prometheus-analyst agent. Its result is
  # named `metrics_summary` and is read by the analyze-and-report step (via
  # steps.check-metrics.output).
  # NOTE(review): `metrics_summary` is not listed in the top-level `outputs` —
  # confirm that is intentional.
  - name: check-metrics
    agent: prometheus-analyst
    model: haiku  # set explicitly per step; defaults.model is sonnet
    # Literal block scalar (|): line breaks in the prompt text are preserved.
    task: |
      Query key cluster metrics:
      - Node CPU and memory usage (current and 1h average)
      - Top 5 pods by CPU usage
      - Top 5 pods by memory usage
      - Any active firing alerts
    output: metrics_summary

  # ArgoCD application probe delegated to the argocd-operator agent. Its result
  # is named `argocd_status` and is read by the analyze-and-report step (via
  # steps.check-argocd.output).
  # NOTE(review): `argocd_status` is not listed in the top-level `outputs` —
  # confirm that is intentional.
  - name: check-argocd
    agent: argocd-operator
    model: haiku  # set explicitly per step; defaults.model is sonnet
    # Literal block scalar (|): line breaks in the prompt text are preserved.
    task: |
      Check ArgoCD application status:
      - List all applications with sync and health status
      - Report any apps that are OutOfSync or Degraded
      - Note last sync time for each app
    output: argocd_status

  # Aggregation step: the prompt interpolates the `output` of the four check
  # steps (check-nodes, check-pods, check-metrics, check-argocd) and asks for
  # one report, which is named `health_report`.
  # NOTE(review): the steps.<name>.output placeholders use hyphenated step
  # names; confirm the runner's template engine resolves them (Jinja-style
  # engines would read `-` as subtraction).
  # NOTE(review): step outputs are spliced into the prompt as raw text;
  # multi-line values will not follow the bullet indentation — presumably
  # acceptable for prompt text, verify with the runner.
  - name: analyze-and-report
    agent: k8s-orchestrator
    model: sonnet  # same value as defaults.model, stated explicitly
    # Literal block scalar (|): line breaks and blank lines in the prompt text
    # are preserved.
    task: |
      Analyze the health check results and create a summary report:

      Inputs:
      - Node status: {{ steps.check-nodes.output }}
      - Pod status: {{ steps.check-pods.output }}
      - Metrics: {{ steps.check-metrics.output }}
      - ArgoCD: {{ steps.check-argocd.output }}

      Create a report with:
      1. Overall cluster health (Healthy/Degraded/Critical)
      2. Summary table of key metrics
      3. List of issues found (if any)
      4. Recommended actions (mark as safe/confirm)

      If issues are critical, propose immediate remediation steps.
    output: health_report
    # Presumably asks for user confirmation when the report proposes actions;
    # how `actions_proposed` is evaluated is not visible in this file — TODO
    # confirm against the runner.
    confirm_if: actions_proposed

# Step outputs surfaced at workflow level (presumably what a run exposes to
# its caller — confirm against the runner).
# NOTE(review): `metrics_summary` and `argocd_status` are produced by steps in
# this file but are not listed here — confirm the omission is intentional.
outputs:
  - health_report
  - node_status
  - pod_status