feat: Implement Phase 3 automation for K8s agent system

Automation components for scheduled and event-driven workflows:

Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion

Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)

New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures

Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
OpenCode Test
2025-12-26 11:49:05 -08:00
parent 5646508adb
commit c14bae9a12
8 changed files with 623 additions and 2 deletions

View File

@@ -0,0 +1,55 @@
# Alertmanager Configuration Example
# Add this webhook receiver to send alerts to the K8s agent system
#
# This is an EXAMPLE - merge with your existing alertmanager config

global:
  resolve_timeout: 5m

route:
  receiver: 'default'
  group_by: ['alertname', 'namespace']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Route pod-related alerts to the agent.
    # continue: true lets the alert ALSO flow on to the default receiver.
    - match_re:
        alertname: ^(KubePodCrashLooping|KubePodNotReady|KubeContainerWaiting)$
      receiver: 'k8s-agent'
      continue: true
    # Route node-related alerts to the agent
    - match_re:
        alertname: ^(KubeNodeNotReady|KubeNodeUnreachable|NodeMemoryHighUsage)$
      receiver: 'k8s-agent'
      continue: true
    # Route resource alerts to the agent
    - match_re:
        alertname: ^(KubeMemoryOvercommit|KubeCPUOvercommit|KubePersistentVolumeFillingUp)$
      receiver: 'k8s-agent'
      continue: true

receivers:
  - name: 'default'
    # Your default receiver (email, slack, etc.)
  - name: 'k8s-agent'
    webhook_configs:
      # Option 1: Dashboard webhook endpoint
      - url: 'http://k8s-agent-dashboard.k8s-agent.svc.cluster.local/api/webhooks/alertmanager'
        send_resolved: true
      # Option 2: External webhook receiver (if dashboard not available)
      # - url: 'http://your-webhook-host:9000/alerts'
      #   send_resolved: true

# Inhibit rules - suppress a warning when the matching critical alert
# (same alertname + namespace) is already firing, to prevent duplicates.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'namespace']

89
automation/scheduler.sh Executable file
View File

@@ -0,0 +1,89 @@
#!/bin/bash
# K8s Agent Workflow Scheduler
# Run workflows on a schedule using Claude Code CLI
#
# Usage:
#   ./scheduler.sh <workflow-name> [extra context args...]
#   ./scheduler.sh cluster-health-check
#
# Extra args (e.g. "--namespace foo --pod bar" from webhook-receiver.sh)
# are appended to the agent prompt as additional context.
#
# Cron examples:
#   # Health check every 6 hours
#   0 */6 * * * /home/will/.claude/automation/scheduler.sh cluster-health-check
#
#   # Daily resource report at 8am
#   0 8 * * * /home/will/.claude/automation/scheduler.sh daily-report
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/workflows"
WORKFLOW_DIR="${CLAUDE_DIR}/workflows"

# Ensure log directory exists
mkdir -p "${LOG_DIR}"

# Get workflow name
WORKFLOW="${1:-}"
if [[ -z "${WORKFLOW}" ]]; then
  echo "Usage: $0 <workflow-name> [extra context...]"
  echo "Available workflows:"
  # Parenthesize the -o alternation and include *.yml to match the lookup
  # loop below (it accepts yaml/yml/md).
  find "${WORKFLOW_DIR}" \( -name "*.yaml" -o -name "*.yml" -o -name "*.md" \) | while read -r f; do
    basename "${f}" | sed 's/\.[^.]*$//'
  done
  exit 1
fi
shift
# Remaining args become extra context for the agent prompt. Callers that
# pass none (cron) are unaffected.
EXTRA_CONTEXT="$*"

# Find workflow file (first match wins: yaml > yml > md, health > deploy > incidents)
WORKFLOW_FILE=""
for ext in yaml yml md; do
  for dir in health deploy incidents; do
    if [[ -f "${WORKFLOW_DIR}/${dir}/${WORKFLOW}.${ext}" ]]; then
      WORKFLOW_FILE="${WORKFLOW_DIR}/${dir}/${WORKFLOW}.${ext}"
      break 2
    fi
  done
done

if [[ -z "${WORKFLOW_FILE}" ]]; then
  echo "Error: Workflow '${WORKFLOW}' not found" >&2
  exit 1
fi

# Generate log filename
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
LOG_FILE="${LOG_DIR}/${WORKFLOW}_${TIMESTAMP}.log"

echo "Running workflow: ${WORKFLOW}"
echo "Workflow file: ${WORKFLOW_FILE}"
echo "Log file: ${LOG_FILE}"

# Build the agent prompt once, outside the pipeline.
PROMPT="Run the following workflow: $(cat "${WORKFLOW_FILE}")"
if [[ -n "${EXTRA_CONTEXT}" ]]; then
  PROMPT+=$'\n'"Additional context: ${EXTRA_CONTEXT}"
fi

# Run Claude Code with the workflow.
# Using --print to get output, --dangerously-skip-permissions for automation.
run_workflow() {
  echo "=== Workflow: ${WORKFLOW} ==="
  echo "=== Started: $(date) ==="
  echo ""
  claude --print --dangerously-skip-permissions "${PROMPT}" 2>&1
}

# BUG FIX: the old `EXIT_CODE=$?` lived inside `{ ... } | tee`, i.e. in a
# pipeline subshell, so the value never reached the parent and the script
# always reported exit 0. Capture claude's status via PIPESTATUS instead;
# the `||` also keeps `set -e -o pipefail` from killing us before we can
# log the footer and notify the dashboard.
EXIT_CODE=0
run_workflow | tee "${LOG_FILE}" || EXIT_CODE=${PIPESTATUS[0]}

{
  echo ""
  echo "=== Completed: $(date) ==="
  echo "=== Exit code: ${EXIT_CODE} ==="
} | tee -a "${LOG_FILE}"

# Notify dashboard of completion (if running); best-effort, never fatal.
DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8080}"
if curl -s "${DASHBOARD_URL}/api/health" > /dev/null 2>&1; then
  curl -s -X POST "${DASHBOARD_URL}/api/workflows/${WORKFLOW}/complete" \
    -H "Content-Type: application/json" \
    -d "{\"log_file\": \"${LOG_FILE}\", \"exit_code\": ${EXIT_CODE}}" \
    > /dev/null 2>&1 || true
fi

# Propagate the real workflow status to cron / callers.
exit "${EXIT_CODE}"

67
automation/webhook-receiver.sh Executable file
View File

@@ -0,0 +1,67 @@
#!/bin/bash
# Alertmanager Webhook Receiver
# Receives alerts and triggers appropriate workflows
#
# This script is designed to be called by a simple webhook server
# or can be integrated into the dashboard.
#
# Usage:
#   echo '{"alerts": [...]}' | ./webhook-receiver.sh
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/webhooks"

mkdir -p "${LOG_DIR}"

# Read JSON from stdin
PAYLOAD=$(cat)
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
LOG_FILE="${LOG_DIR}/alert_${TIMESTAMP}.json"

# Log the raw payload first so malformed requests are still auditable.
printf '%s\n' "${PAYLOAD}" > "${LOG_FILE}"

# BUG FIX: the old `jq ... 2>/dev/null | while` silently turned malformed
# payloads into a no-op "success". Validate explicitly and fail loudly.
if ! jq -e '.alerts' > /dev/null 2>&1 <<< "${PAYLOAD}"; then
  echo "Error: payload is not valid Alertmanager JSON (logged to ${LOG_FILE})" >&2
  exit 1
fi

# Parse alerts and trigger workflows.
# Process substitution (instead of `jq | while`) keeps this loop in the
# current shell, so workflows launched with `&` are children of this script
# rather than of a throwaway pipeline subshell.
while IFS= read -r alert; do
  ALERTNAME=$(jq -r '.labels.alertname // "unknown"' <<< "${alert}")
  STATUS=$(jq -r '.status // "firing"' <<< "${alert}")
  NAMESPACE=$(jq -r '.labels.namespace // "default"' <<< "${alert}")
  POD=$(jq -r '.labels.pod // ""' <<< "${alert}")

  echo "Processing alert: ${ALERTNAME} (${STATUS})"

  # Only process firing alerts
  if [[ "${STATUS}" != "firing" ]]; then
    echo "  Skipping resolved alert"
    continue
  fi

  # Map alerts to workflows. Each workflow runs in the background so one
  # slow remediation does not block processing of the remaining alerts.
  case "${ALERTNAME}" in
    KubePodCrashLooping|KubePodNotReady)
      echo "  Triggering pod-crashloop workflow"
      "${SCRIPT_DIR}/scheduler.sh" pod-crashloop-remediation \
        --namespace "${NAMESPACE}" --pod "${POD}" &
      ;;
    KubeNodeNotReady|KubeNodeUnreachable)
      echo "  Triggering node-issue workflow"
      "${SCRIPT_DIR}/scheduler.sh" node-issue-response &
      ;;
    KubeMemoryOvercommit|KubeCPUOvercommit)
      echo "  Triggering resource-pressure workflow"
      "${SCRIPT_DIR}/scheduler.sh" resource-pressure-response &
      ;;
    TargetDown|PrometheusTargetMissing)
      echo "  Triggering target-down workflow"
      "${SCRIPT_DIR}/scheduler.sh" target-down-investigation &
      ;;
    *)
      echo "  No workflow mapped for ${ALERTNAME}"
      ;;
  esac
done < <(jq -c '.alerts[]' <<< "${PAYLOAD}")

echo "Webhook processing complete"