feat: Implement Phase 3 automation for K8s agent system
Automation components for scheduled and event-driven workflows:
Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion
Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)
New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures
Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
55
automation/alertmanager-config.yaml
Normal file
55
automation/alertmanager-config.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Alertmanager Configuration Example
# Add this webhook receiver to send alerts to the K8s agent system
#
# This is an EXAMPLE - merge with your existing alertmanager config
#
# NOTE: `match_re`, `source_match` and `target_match` are marked deprecated in
# the Alertmanager configuration reference in favour of `matchers`,
# `source_matchers` and `target_matchers`. They are kept here unchanged; switch
# to the newer fields if your Alertmanager version supports them.

global:
  resolve_timeout: 5m

route:
  # Receiver used when an alert matches none of the child routes below.
  receiver: 'default'
  group_by: ['alertname', 'namespace']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  routes:
    # `continue: true` lets an alert keep matching the sibling routes that
    # follow. It does not by itself also deliver the alert to the parent
    # 'default' receiver.
    # NOTE(review): confirm agent-routed alerts are not expected to reach
    # 'default' as well.

    # Route pod-related alerts to the agent
    # NOTE(review): webhook-receiver.sh has no workflow mapped for
    # KubeContainerWaiting - confirm whether one is intended.
    - match_re:
        alertname: ^(KubePodCrashLooping|KubePodNotReady|KubeContainerWaiting)$
      receiver: 'k8s-agent'
      continue: true

    # Route node-related alerts to the agent
    # NOTE(review): webhook-receiver.sh has no workflow mapped for
    # NodeMemoryHighUsage - confirm whether one is intended.
    - match_re:
        alertname: ^(KubeNodeNotReady|KubeNodeUnreachable|NodeMemoryHighUsage)$
      receiver: 'k8s-agent'
      continue: true

    # Route resource alerts to the agent
    # NOTE(review): webhook-receiver.sh has no workflow mapped for
    # KubePersistentVolumeFillingUp - confirm whether one is intended.
    - match_re:
        alertname: ^(KubeMemoryOvercommit|KubeCPUOvercommit|KubePersistentVolumeFillingUp)$
      receiver: 'k8s-agent'
      continue: true

receivers:
  - name: 'default'
    # Your default receiver (email, slack, etc.)

  - name: 'k8s-agent'
    webhook_configs:
      # Option 1: Dashboard webhook endpoint
      - url: 'http://k8s-agent-dashboard.k8s-agent.svc.cluster.local/api/webhooks/alertmanager'
        # Resolved notifications are delivered too; webhook-receiver.sh skips
        # every alert whose status is not "firing".
        send_resolved: true

      # Option 2: External webhook receiver (if dashboard not available)
      # - url: 'http://your-webhook-host:9000/alerts'
      #   send_resolved: true

# Inhibit rules - prevent duplicate alerts
# A firing 'critical' alert mutes 'warning' alerts that carry the same
# alertname and namespace labels.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'namespace']
|
||||
89
automation/scheduler.sh
Executable file
89
automation/scheduler.sh
Executable file
@@ -0,0 +1,89 @@
|
||||
#!/bin/bash
# K8s Agent Workflow Scheduler
# Run workflows on a schedule using Claude Code CLI
#
# Usage:
#   ./scheduler.sh <workflow-name> [context-arg...]
#   ./scheduler.sh cluster-health-check
#   ./scheduler.sh pod-crashloop-remediation --namespace prod --pod api-0
#
# Arguments:
#   <workflow-name>  Base name (no extension) of a .yaml/.yml/.md file under
#                    workflows/{health,deploy,incidents}/.
#   [context-arg...] Optional words (e.g. the --namespace/--pod pairs sent by
#                    webhook-receiver.sh). They are appended to the prompt as
#                    "Workflow parameters: ...". Only [A-Za-z0-9._-] is
#                    accepted; if any word contains anything else, all context
#                    is ignored and the workflow runs without it.
#
# Environment:
#   DASHBOARD_URL    Dashboard base URL (default: http://localhost:8080).
#
# Exit status:
#   Exit code of the claude invocation; 1 on usage error or unknown workflow.
#
# Cron examples:
#   # Health check every 6 hours
#   0 */6 * * * /home/will/.claude/automation/scheduler.sh cluster-health-check
#
#   # Daily resource report at 8am
#   0 8 * * * /home/will/.claude/automation/scheduler.sh daily-report

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/workflows"
WORKFLOW_DIR="${CLAUDE_DIR}/workflows"

# Ensure log directory exists
mkdir -p "${LOG_DIR}"

# Get workflow name
WORKFLOW="${1:-}"
if [[ -z "${WORKFLOW}" ]]; then
    {
        echo "Usage: $0 <workflow-name>"
        echo "Available workflows:"
        # List every extension the lookup below accepts. '|| true' keeps a
        # missing workflow directory from masking the usage exit status.
        find "${WORKFLOW_DIR}" -type f \
            \( -name "*.yaml" -o -name "*.yml" -o -name "*.md" \) \
            | while IFS= read -r f; do
                name="${f##*/}"
                echo "${name%.*}"
            done || true
    } >&2
    exit 1
fi
shift

# Collect optional context arguments.
# These can originate from alert labels and end up in a prompt that runs with
# permission checks disabled, so only a conservative character set is accepted.
CONTEXT=""
for arg in "$@"; do
    if [[ ! "${arg}" =~ ^[A-Za-z0-9._-]*$ ]]; then
        echo "Warning: ignoring context arguments (unsupported characters)" >&2
        CONTEXT=""
        break
    fi
    # %q leaves safe words unchanged and renders an empty value as ''.
    printf -v quoted '%q' "${arg}"
    CONTEXT+="${CONTEXT:+ }${quoted}"
done

# Find workflow file
WORKFLOW_FILE=""
for ext in yaml yml md; do
    for dir in health deploy incidents; do
        if [[ -f "${WORKFLOW_DIR}/${dir}/${WORKFLOW}.${ext}" ]]; then
            WORKFLOW_FILE="${WORKFLOW_DIR}/${dir}/${WORKFLOW}.${ext}"
            break 2
        fi
    done
done

if [[ -z "${WORKFLOW_FILE}" ]]; then
    echo "Error: Workflow '${WORKFLOW}' not found" >&2
    exit 1
fi

# Generate log filename
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
LOG_FILE="${LOG_DIR}/${WORKFLOW}_${TIMESTAMP}.log"

echo "Running workflow: ${WORKFLOW}"
echo "Workflow file: ${WORKFLOW_FILE}"
echo "Log file: ${LOG_FILE}"

# Build the prompt up front so an unreadable workflow file fails the script
# here instead of producing an empty prompt.
PROMPT="Run the following workflow: $(cat "${WORKFLOW_FILE}")"
if [[ -n "${CONTEXT}" ]]; then
    PROMPT+=$'\n\n'"Workflow parameters: ${CONTEXT}"
fi

# Run Claude Code with the workflow
# Using --print to get output, --dangerously-skip-permissions for automation
#
# The group on the left of the pipe runs in a subshell, so its variables are
# not visible afterwards. The subshell therefore exits with claude's status and
# the parent reads it back from PIPESTATUS. '|| RUN_STATUS=$?' stops 'set -e'
# from aborting before the completion footer is logged.
EXIT_CODE=0
{
    echo "=== Workflow: ${WORKFLOW} ==="
    echo "=== Started: $(date) ==="
    echo ""

    RUN_STATUS=0
    claude --print --dangerously-skip-permissions "${PROMPT}" 2>&1 \
        || RUN_STATUS=$?

    echo ""
    echo "=== Completed: $(date) ==="
    echo "=== Exit code: ${RUN_STATUS} ==="
    exit "${RUN_STATUS}"
} | tee "${LOG_FILE}" || EXIT_CODE=${PIPESTATUS[0]}

# Notify dashboard of completion (if running)
# Best effort: timeouts keep an unresponsive dashboard from hanging the job,
# and a failed notification never changes the script's exit status.
DASHBOARD_URL="${DASHBOARD_URL:-http://localhost:8080}"
if curl -s --max-time 5 "${DASHBOARD_URL}/api/health" > /dev/null 2>&1; then
    # Escape backslashes and double quotes so the path stays valid JSON.
    LOG_FILE_JSON="${LOG_FILE//\\/\\\\}"
    LOG_FILE_JSON="${LOG_FILE_JSON//\"/\\\"}"
    curl -s --max-time 10 -X POST \
        "${DASHBOARD_URL}/api/workflows/${WORKFLOW}/complete" \
        -H "Content-Type: application/json" \
        -d "{\"log_file\": \"${LOG_FILE_JSON}\", \"exit_code\": ${EXIT_CODE}}" \
        > /dev/null 2>&1 || true
fi

exit "${EXIT_CODE}"
|
||||
67
automation/webhook-receiver.sh
Executable file
67
automation/webhook-receiver.sh
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
# Alertmanager Webhook Receiver
# Receives alerts and triggers appropriate workflows
#
# This script is designed to be called by a simple webhook server
# or can be integrated into the dashboard.
#
# Usage:
#   echo '{"alerts": [...]}' | ./webhook-receiver.sh
#
# Input:
#   A JSON object on stdin. Each element of its "alerts" array is read for
#   .status, .labels.alertname, .labels.namespace and .labels.pod.
#
# Behaviour:
#   - The raw payload is always saved under logs/webhooks/ first.
#   - Only alerts whose status is "firing" trigger a workflow.
#   - Workflows are started in the background through scheduler.sh; this
#     script does not wait for them.
#
# Exit status:
#   0 when the payload was processed; 1 when jq is missing or the payload is
#   not a JSON object.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/webhooks"

mkdir -p "${LOG_DIR}"

# Read JSON from stdin
PAYLOAD=$(cat)
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
# The PID suffix keeps payloads that arrive within the same second from
# overwriting each other.
LOG_FILE="${LOG_DIR}/alert_${TIMESTAMP}_$$.json"

# Log the payload before any validation, so a rejected payload is still on
# disk for inspection.
printf '%s\n' "${PAYLOAD}" > "${LOG_FILE}"

if ! command -v jq > /dev/null 2>&1; then
    echo "Error: jq is required to parse alert payloads" >&2
    exit 1
fi

# Reject empty or malformed input with a message instead of letting a
# suppressed jq error end the script silently.
if ! jq -e 'type == "object"' <<< "${PAYLOAD}" > /dev/null 2>&1; then
    echo "Error: payload is not a JSON object (saved to ${LOG_FILE})" >&2
    exit 1
fi

# Parse alerts and trigger workflows
# '.alerts[]?' yields nothing when the key is missing or is not iterable.
# Feeding the loop through process substitution keeps it in the main shell.
while IFS= read -r alert; do
    ALERTNAME=$(jq -r '.labels.alertname // "unknown"' <<< "${alert}")
    STATUS=$(jq -r '.status // "firing"' <<< "${alert}")
    NAMESPACE=$(jq -r '.labels.namespace // "default"' <<< "${alert}")
    POD=$(jq -r '.labels.pod // ""' <<< "${alert}")

    echo "Processing alert: ${ALERTNAME} (${STATUS})"

    # Only process firing alerts
    if [[ "${STATUS}" != "firing" ]]; then
        echo "  Skipping resolved alert"
        continue
    fi

    # Map alerts to workflows
    case "${ALERTNAME}" in
        KubePodCrashLooping|KubePodNotReady)
            echo "  Triggering pod-crashloop workflow"
            "${SCRIPT_DIR}/scheduler.sh" pod-crashloop-remediation \
                --namespace "${NAMESPACE}" --pod "${POD}" &
            ;;
        KubeNodeNotReady|KubeNodeUnreachable)
            echo "  Triggering node-issue workflow"
            "${SCRIPT_DIR}/scheduler.sh" node-issue-response &
            ;;
        KubeMemoryOvercommit|KubeCPUOvercommit)
            echo "  Triggering resource-pressure workflow"
            "${SCRIPT_DIR}/scheduler.sh" resource-pressure-response &
            ;;
        TargetDown|PrometheusTargetMissing)
            echo "  Triggering target-down workflow"
            "${SCRIPT_DIR}/scheduler.sh" target-down-investigation &
            ;;
        *)
            echo "  No workflow mapped for ${ALERTNAME}"
            ;;
    esac
done < <(jq -c '.alerts[]?' <<< "${PAYLOAD}")

echo "Webhook processing complete"
|
||||
Reference in New Issue
Block a user