feat: Implement Phase 3 automation for K8s agent system
Automation components for scheduled and event-driven workflows:
Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion
Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)
New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures
Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
67
automation/webhook-receiver.sh
Executable file
67
automation/webhook-receiver.sh
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
# Alertmanager Webhook Receiver
#
# Reads an Alertmanager webhook payload (JSON) from stdin, archives it under
# <script dir>/../logs/webhooks/, and starts the workflow mapped to each
# firing alert by running scheduler.sh (expected next to this script) in the
# background. Resolved alerts and alerts without a mapping are only reported.
#
# This script is designed to be called by a simple webhook server, or it can
# be integrated into the dashboard.
#
# Usage:
#   echo '{"alerts": [...]}' | ./webhook-receiver.sh
#
# Requires: jq
#
# Exit status:
#   0         payload archived and every alert processed
#   non-zero  jq could not extract `.alerts[]` (jq's own status is returned),
#             or another command failed under `set -e`

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/webhooks"

#######################################
# Handle a single alert: report it and, if it is firing and has a mapped
# workflow, launch that workflow in the background.
# Globals:   SCRIPT_DIR (read)
# Arguments: $1 - one alert as compact JSON
# Outputs:   progress messages on stdout
# Returns:   0; a jq failure aborts the script via `set -e`
#######################################
process_alert() {
  local alert_json=$1
  local alertname status namespace pod

  # Declaration and assignment are kept separate so a jq failure is not
  # masked by the exit status of `local`.
  alertname=$(jq -r '.labels.alertname // "unknown"' <<<"${alert_json}")
  status=$(jq -r '.status // "firing"' <<<"${alert_json}")
  namespace=$(jq -r '.labels.namespace // "default"' <<<"${alert_json}")
  pod=$(jq -r '.labels.pod // ""' <<<"${alert_json}")

  echo "Processing alert: ${alertname} (${status})"

  # Only process firing alerts
  if [[ "${status}" != "firing" ]]; then
    echo " Skipping resolved alert"
    return 0
  fi

  # Map alerts to workflows. Workflows are started in the background and are
  # deliberately not waited for, so the receiver returns promptly.
  case "${alertname}" in
    KubePodCrashLooping|KubePodNotReady)
      echo " Triggering pod-crashloop workflow"
      # NOTE(review): --pod is passed even when the alert carries no pod
      # label (empty string); confirm scheduler.sh tolerates that.
      "${SCRIPT_DIR}/scheduler.sh" pod-crashloop-remediation \
        --namespace "${namespace}" --pod "${pod}" &
      ;;
    KubeNodeNotReady|KubeNodeUnreachable)
      echo " Triggering node-issue workflow"
      "${SCRIPT_DIR}/scheduler.sh" node-issue-response &
      ;;
    KubeMemoryOvercommit|KubeCPUOvercommit)
      echo " Triggering resource-pressure workflow"
      "${SCRIPT_DIR}/scheduler.sh" resource-pressure-response &
      ;;
    TargetDown|PrometheusTargetMissing)
      echo " Triggering target-down workflow"
      "${SCRIPT_DIR}/scheduler.sh" target-down-investigation &
      ;;
    *)
      echo " No workflow mapped for ${alertname}"
      ;;
  esac
}

#######################################
# Archive the payload read from stdin and process each alert it contains.
# Globals:   LOG_DIR (read)
# Arguments: none (payload is read from stdin)
# Outputs:   progress on stdout, diagnostics on stderr
#######################################
main() {
  local payload timestamp log_file alerts alert
  local rc=0

  mkdir -p "${LOG_DIR}"

  # Read JSON from stdin
  payload=$(cat)
  timestamp=$(date +%Y-%m-%d_%H-%M-%S)
  # The timestamp has one-second resolution; the PID suffix keeps two
  # deliveries that arrive within the same second from overwriting each other.
  log_file="${LOG_DIR}/alert_${timestamp}_$$.json"

  # Log the payload before parsing so that rejected payloads are kept too.
  printf '%s\n' "${payload}" > "${log_file}"

  # Extract all alerts up front. jq's own diagnostics are left on stderr, and
  # a failure is reported explicitly instead of ending the script silently.
  alerts=$(jq -c '.alerts[]' <<<"${payload}") || rc=$?
  if (( rc != 0 )); then
    printf 'webhook-receiver: cannot read .alerts[] from payload (jq exit %d); payload saved to %s\n' \
      "${rc}" "${log_file}" >&2
    exit "${rc}"
  fi

  # Parse alerts and trigger workflows
  while IFS= read -r alert; do
    # The here-string yields a single empty line when there are no alerts.
    [[ -n "${alert}" ]] || continue
    process_alert "${alert}"
  done <<<"${alerts}"

  echo "Webhook processing complete"
}

main "$@"
|
||||
Reference in New Issue
Block a user