feat: Implement Phase 3 automation for K8s agent system
Automation components for scheduled and event-driven workflows:
Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion
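A cron entry along these lines would wire the scheduler in; `scheduler.sh` and the log directory come from this commit, while the workflow name and install path are assumed for illustration:

```shell
# Hypothetical crontab entry: run a nightly workflow via scheduler.sh at 02:00
# and append the run's output to the workflow log directory.
# "nightly-maintenance" and the scheduler.sh install path are assumed names.
0 2 * * * $HOME/.claude/scheduler.sh run nightly-maintenance >> $HOME/.claude/logs/workflows/cron.log 2>&1
```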
Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)
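The webhook path can be exercised by hand with a minimal payload in Alertmanager's standard webhook format; the endpoint path is the one listed above, while the dashboard URL and label values are assumptions:

```shell
# Simulate an Alertmanager webhook delivery to the dashboard endpoint.
# DASHBOARD_URL is an assumed address; the JSON shape follows Alertmanager's
# webhook payload format (status + alerts[] with labels).
DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8080}"
payload='{"status":"firing","alerts":[{"labels":{"alertname":"KubeNodeNotReady","node":"pi5-node1"}}]}'
# Post the payload; tolerate a missing dashboard when only checking the shape.
echo "$payload" | curl -sf -X POST "$DASHBOARD_URL/api/webhooks/alertmanager" \
  -H 'Content-Type: application/json' -d @- || echo "dashboard not reachable"
```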
New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures
Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility
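A finished workflow run would report back with a call like the following; the endpoint path matches the one above, while the result fields and dashboard URL are assumptions:

```shell
# Sketch of the completion callback the scheduler is described as sending.
# DASHBOARD_URL and the JSON result fields are assumptions; the
# /api/workflows/{name}/complete path is from the commit message.
DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8080}"
workflow="node-issue-response"
result='{"status":"completed","summary":"node recovered after kubelet restart"}'
curl -sf -X POST "$DASHBOARD_URL/api/workflows/$workflow/complete" \
  -H 'Content-Type: application/json' -d "$result" || echo "dashboard not reachable"
```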
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
110
workflows/incidents/argocd-sync-failure.yaml
Normal file
@@ -0,0 +1,110 @@
name: argocd-sync-failure
description: Investigate and resolve ArgoCD sync failures
version: "1.0"

trigger:
  - alert:
      match:
        alertname: ArgoCDAppOutOfSync
  - alert:
      match:
        alertname: ArgoCDAppSyncFailed
  - manual: true

inputs:
  - name: app
    description: ArgoCD application name
    required: true

defaults:
  model: sonnet

steps:
  - name: get-app-status
    agent: argocd-operator
    model: haiku
    task: |
      Get detailed status of the application:
      - App name: {{ inputs.app | default(alert.labels.name) }}
      - Sync status and message
      - Health status
      - Last sync attempt and result
      - Current revision vs target revision
    output: app_status

  - name: check-diff
    agent: argocd-operator
    model: sonnet
    task: |
      Analyze the diff between desired and live state:
      - Run argocd app diff
      - Identify what resources differ
      - Check for drift vs intentional changes

      App: {{ steps.get-app-status.output.app_name }}
    output: diff_analysis

  - name: check-git
    agent: git-operator
    model: haiku
    task: |
      Check the GitOps repo for recent changes:
      - Recent commits to the app path
      - Any open PRs affecting this app
      - Validate manifest syntax

      App path: {{ steps.get-app-status.output.source_path }}
    output: git_status

  - name: check-resources
    agent: k8s-diagnostician
    model: haiku
    task: |
      Check related Kubernetes resources:
      - Pod status in the app namespace
      - Any pending resources
      - Events related to the app

      Namespace: {{ steps.get-app-status.output.namespace }}
    output: k8s_status

  - name: diagnose-and-fix
    agent: k8s-orchestrator
    model: sonnet
    task: |
      Diagnose the sync failure and recommend a fix:

      Evidence:
      - App status: {{ steps.get-app-status.output }}
      - Diff analysis: {{ steps.check-diff.output }}
      - Git status: {{ steps.check-git.output }}
      - K8s resources: {{ steps.check-resources.output }}

      Common causes:
      1. Resource conflict (another controller managing the resource)
      2. Invalid manifest (syntax or semantic error)
      3. Missing dependencies (CRDs, secrets, configmaps)
      4. Resource quota exceeded
      5. Image pull failures

      Provide:
      - Root cause
      - Fix recommendation
      - Whether to retry sync or fix the manifest first
    output: diagnosis

  - name: attempt-resync
    condition: "{{ steps.diagnose-and-fix.output.should_retry }}"
    agent: argocd-operator
    model: haiku
    task: |
      Attempt to resync the application:
      - Refresh application state
      - If the diagnosis suggests, run sync with --force

      App: {{ steps.get-app-status.output.app_name }}
    output: resync_result
    confirm: true

outputs:
  - diagnosis
  - resync_result
108
workflows/incidents/node-issue-response.yaml
Normal file
@@ -0,0 +1,108 @@
name: node-issue-response
description: Respond to node issues (NotReady, unreachable)
version: "1.0"

trigger:
  - alert:
      match:
        alertname: KubeNodeNotReady
  - alert:
      match:
        alertname: KubeNodeUnreachable
  - manual: true

inputs:
  - name: node
    description: Node name
    required: true

defaults:
  model: sonnet

steps:
  - name: identify-node
    agent: k8s-diagnostician
    model: haiku
    task: |
      Identify the affected node:
      - Node: {{ inputs.node | default(alert.labels.node) }}

      Get node details:
      - Current conditions
      - Last heartbeat time
      - Kubelet status
      - Resource capacity vs allocatable
    output: node_info

  - name: check-workloads
    agent: k8s-diagnostician
    model: haiku
    task: |
      List workloads on the affected node:
      - Pods running on the node
      - Any pods in Pending state due to node issues
      - DaemonSets that should be on this node

      Node: {{ steps.identify-node.output.node_name }}
    output: workload_status

  - name: check-metrics
    agent: prometheus-analyst
    model: haiku
    task: |
      Check node metrics history:
      - CPU/memory usage trend before the issue
      - Network connectivity metrics
      - Disk I/O and space
      - Any anomalies in the last hour

      Node: {{ steps.identify-node.output.node_name }}
    output: metrics_analysis

  - name: diagnose-and-recommend
    agent: k8s-orchestrator
    model: sonnet
    task: |
      Analyze the node issue:

      Evidence:
      - Node info: {{ steps.identify-node.output }}
      - Workloads: {{ steps.check-workloads.output }}
      - Metrics: {{ steps.check-metrics.output }}

      Determine:
      1. Root cause (network, resource exhaustion, kubelet crash, hardware)
      2. Impact (number of affected pods, critical workloads)
      3. Recovery options

      For Pi cluster context:
      - Pi 5 nodes: can handle more recovery actions
      - Pi 3 node: be conservative, limited resources

      Recommend actions with a risk classification.
    output: diagnosis

  - name: safe-actions
    condition: "{{ steps.diagnose-and-recommend.output.has_safe_action }}"
    agent: k8s-diagnostician
    model: haiku
    task: |
      Execute safe recovery actions:
      - Attempt to reschedule affected pods
      - Check if the node recovers on its own

      Do NOT:
      - Drain the node (requires confirmation)
      - Cordon the node (requires confirmation)
    output: recovery_result

outputs:
  - diagnosis
  - recovery_result

notifications:
  on_complete:
    summary: |
      Node issue response for {{ steps.identify-node.output.node_name }}:
      - Status: {{ steps.diagnose-and-recommend.output.node_status }}
      - Root cause: {{ steps.diagnose-and-recommend.output.root_cause }}
      - Affected pods: {{ steps.check-workloads.output.pod_count }}
||||
97
workflows/incidents/resource-pressure-response.yaml
Normal file
@@ -0,0 +1,97 @@
name: resource-pressure-response
description: Respond to cluster resource pressure alerts
version: "1.0"

trigger:
  - alert:
      match:
        alertname: KubeMemoryOvercommit
  - alert:
      match:
        alertname: KubeCPUOvercommit
  - manual: true

defaults:
  model: sonnet

steps:
  - name: assess-pressure
    agent: prometheus-analyst
    model: sonnet
    task: |
      Assess current resource pressure:
      - Per-node CPU usage and requests vs limits
      - Per-node memory usage and requests vs limits
      - Identify nodes under the most pressure
      - Check for OOM events in the last hour

      Focus on Pi cluster constraints:
      - Pi 5 (8GB): higher capacity
      - Pi 3 (1GB): very limited, check if overloaded
    output: pressure_analysis

  - name: identify-hogs
    agent: k8s-diagnostician
    model: haiku
    task: |
      Identify resource-heavy workloads:
      - Top 5 pods by CPU usage
      - Top 5 pods by memory usage
      - Any pods exceeding their requests
      - Any pods with no limits set
    output: resource_hogs

  - name: check-scaling
    agent: argocd-operator
    model: haiku
    task: |
      Check if any deployments can be scaled:
      - List deployments with >1 replica
      - Check HPA configurations
      - Identify candidates for scale-down
    output: scaling_options

  - name: recommend-actions
    agent: k8s-orchestrator
    model: sonnet
    task: |
      Recommend resource optimization actions:

      Analysis:
      - Pressure: {{ steps.assess-pressure.output }}
      - Top consumers: {{ steps.identify-hogs.output }}
      - Scaling options: {{ steps.check-scaling.output }}

      Prioritize actions by impact and safety:

      [SAFE] - can be auto-applied:
      - Clean up completed jobs/pods
      - Identify and report issues

      [CONFIRM] - require approval:
      - Scale down non-critical deployments
      - Adjust resource limits
      - Evict low-priority pods

      [FORBIDDEN] - never auto-apply:
      - Delete PVCs
      - Delete critical workloads
    output: recommendations

  - name: cleanup
    agent: k8s-diagnostician
    model: haiku
    task: |
      Perform safe cleanup actions:
      - Delete completed jobs older than 1 hour
      - Delete succeeded pods
      - Delete failed pods older than 24 hours

      Report what was cleaned up.
    output: cleanup_result
    confirm: false

outputs:
  - pressure_analysis
  - recommendations
  - cleanup_result