# claude-code/workflows/incidents/node-issue-response.yaml

# Workflow identity. `version` is quoted so it loads as the string "1.0"
# rather than as a float.
name: node-issue-response
description: >-
  Respond to node issues (NotReady, unreachable)
version: "1.0"

# Entry points: two alert entries keyed on `alertname`, plus a manual run
# whose `node` input is required.
trigger:
  - alert:
      match: {alertname: KubeNodeNotReady}
  - alert:
      match: {alertname: KubeNodeUnreachable}
  - manual: true
    inputs:
      - name: node
        description: Node name
        required: true

# Workflow-wide defaults. Every step visible in this file sets its own
# `model`; presumably this value applies to steps that omit it — confirm
# against the workflow runner.
defaults:
  model: sonnet

steps:
  # Step 1: resolve which node the workflow is about and collect its state.
  # The node comes from the manual `node` input, falling back to the alert's
  # `node` label. Later steps and the completion summary read
  # `output.node_name` from this step, so the task asks for it explicitly.
  - name: identify-node
    agent: k8s-diagnostician
    model: haiku
    task: |
      Identify the affected node:
      - Node: {{ inputs.node | default(alert.labels.node) }}

      Get node details:
      - Current conditions
      - Last heartbeat time
      - Kubelet status
      - Resource capacity vs allocatable

      Return a structured result that includes:
      - node_name: the exact name of the affected node
    output: node_info

  # Step 2: inventory the workloads on (or blocked by) the affected node.
  # The completion summary reads `output.pod_count` from this step, so the
  # task asks for that field explicitly.
  - name: check-workloads
    agent: k8s-diagnostician
    model: haiku
    task: |
      List workloads on the affected node:
      - Pods running on the node
      - Any pods in Pending state due to node issues
      - DaemonSets that should be on this node

      Node: {{ steps.identify-node.output.node_name }}

      Return a structured result that includes:
      - pod_count: total number of pods affected by the node issue
    output: workload_status

  # Step 3: review the node's recent metrics for signs of what preceded the
  # issue. The node name is taken from the identify-node step; the whole
  # output of this step is passed to diagnose-and-recommend as evidence.
  - name: check-metrics
    agent: prometheus-analyst
    model: haiku
    task: |
      Check node metrics history:
      - CPU/memory usage trend before issue
      - Network connectivity metrics
      - Disk I/O and space
      - Any anomalies in last hour

      Node: {{ steps.identify-node.output.node_name }}
    output: metrics_analysis

  # Step 4: combine the evidence from the three earlier steps into a
  # diagnosis. The safe-actions condition reads `output.has_safe_action`, and
  # the completion summary reads `output.node_status` and `output.root_cause`,
  # so the task asks for all three fields explicitly.
  - name: diagnose-and-recommend
    agent: k8s-orchestrator
    model: sonnet
    task: |
      Analyze the node issue:

      Evidence:
      - Node info: {{ steps.identify-node.output }}
      - Workloads: {{ steps.check-workloads.output }}
      - Metrics: {{ steps.check-metrics.output }}

      Determine:
      1. Root cause (network, resource exhaustion, kubelet crash, hardware)
      2. Impact (number of affected pods, critical workloads)
      3. Recovery options

      For Pi cluster context:
      - Pi 5 nodes: Can handle more recovery actions
      - Pi 3 node: Be conservative, limited resources

      Recommend actions with risk classification.

      Return a structured result that includes:
      - node_status: current status of the node
      - root_cause: the most likely root cause
      - has_safe_action: true if at least one recommended action can be run
        without confirmation (no drain, no cordon), otherwise false
    output: diagnosis

  # Step 5 (conditional): runs only when the diagnosis reports a safe action.
  # The task passes the node name and the diagnosis explicitly, as the other
  # steps do with their inputs, so the agent knows what to act on.
  - name: safe-actions
    condition: "{{ steps.diagnose-and-recommend.output.has_safe_action }}"
    agent: k8s-diagnostician
    model: haiku
    task: |
      Execute safe recovery actions:
      - Attempt to reschedule affected pods
      - Check if node recovers on its own

      Node: {{ steps.identify-node.output.node_name }}
      Diagnosis: {{ steps.diagnose-and-recommend.output }}

      Only carry out actions the diagnosis classifies as safe.

      Do NOT:
      - Drain the node (requires confirmation)
      - Cordon the node (requires confirmation)
    output: recovery_result

# These names match the `output` keys of the diagnose-and-recommend and
# safe-actions steps.
# NOTE(review): `recovery_result` comes from the conditional safe-actions
# step, so it may be absent when that step is skipped — confirm how the
# runner handles a missing output.
outputs: [diagnosis, recovery_result]

# Summary message rendered when the workflow completes. It reads `node_name`
# from identify-node, `node_status` and `root_cause` from
# diagnose-and-recommend, and `pod_count` from check-workloads.
# NOTE(review): step names contain hyphens; in stock Jinja2 an expression
# like `steps.identify-node.output` would parse as a subtraction — confirm
# the template engine resolves hyphenated step names in dotted paths.
notifications:
  on_complete:
    summary: |
      Node issue response for {{ steps.identify-node.output.node_name }}:
      - Status: {{ steps.diagnose-and-recommend.output.node_status }}
      - Root cause: {{ steps.diagnose-and-recommend.output.root_cause }}
      - Affected pods: {{ steps.check-workloads.output.pod_count }}