Automation components for scheduled and event-driven workflows:
Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion
Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)
New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures
Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
68 lines
2.1 KiB
Bash
Executable File
68 lines
2.1 KiB
Bash
Executable File
#!/bin/bash
# Alertmanager Webhook Receiver
# Receives alerts and triggers appropriate workflows
#
# This script is designed to be called by a simple webhook server
# or can be integrated into the dashboard.
#
# Usage:
#   echo '{"alerts": [...]}' | ./webhook-receiver.sh
#
# Input:    JSON on stdin; every element of the top-level "alerts" array
#           is examined.
# Output:   progress messages on stdout, diagnostics on stderr.
# Files:    the raw payload is saved under ../logs/webhooks/ (relative to
#           this script) before any parsing happens.
# Exit:     non-zero on failure, e.g. when jq is missing or the payload
#           cannot be parsed.
# Requires: jq on PATH and scheduler.sh in the same directory as this script.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDE_DIR="${SCRIPT_DIR}/.."
LOG_DIR="${CLAUDE_DIR}/logs/webhooks"

mkdir -p "${LOG_DIR}"

# Fail loudly up front instead of letting a missing jq look like "no alerts".
if ! command -v jq >/dev/null 2>&1; then
  echo "Error: jq is required but was not found in PATH" >&2
  exit 1
fi

# Read JSON from stdin
PAYLOAD=$(cat)
TIMESTAMP=$(date +%Y-%m-%d_%H-%M-%S)
# The PID suffix keeps two webhooks that arrive within the same second from
# overwriting each other's log file.
LOG_FILE="${LOG_DIR}/alert_${TIMESTAMP}_$$.json"

# Log the payload (before parsing, so malformed input is kept for debugging)
printf '%s\n' "${PAYLOAD}" > "${LOG_FILE}"

# Split the payload into one compact JSON document per alert. jq's stderr is
# left visible so the reason for a parse failure is not lost.
if ! ALERTS=$(jq -c '.alerts[]' <<<"${PAYLOAD}"); then
  echo "Error: could not parse alerts from payload (saved to ${LOG_FILE})" >&2
  exit 1
fi

# Parse alerts and trigger workflows
while IFS= read -r alert; do
  # A here-string always yields at least one (possibly empty) line.
  [[ -n "${alert}" ]] || continue

  ALERTNAME=$(jq -r '.labels.alertname // "unknown"' <<<"${alert}")
  STATUS=$(jq -r '.status // "firing"' <<<"${alert}")
  NAMESPACE=$(jq -r '.labels.namespace // "default"' <<<"${alert}")
  POD=$(jq -r '.labels.pod // ""' <<<"${alert}")

  echo "Processing alert: ${ALERTNAME} (${STATUS})"

  # Only process firing alerts
  if [[ "${STATUS}" != "firing" ]]; then
    echo " Skipping resolved alert"
    continue
  fi

  # Map alerts to workflows. Each workflow is started in the background and
  # is not waited for, so the receiver returns without blocking on it.
  case "${ALERTNAME}" in
    KubePodCrashLooping|KubePodNotReady)
      echo " Triggering pod-crashloop workflow"
      # NOTE(review): POD is an empty string when the alert has no pod
      # label; confirm scheduler.sh copes with an empty --pod value.
      "${SCRIPT_DIR}/scheduler.sh" pod-crashloop-remediation \
        --namespace "${NAMESPACE}" --pod "${POD}" &
      ;;
    KubeNodeNotReady|KubeNodeUnreachable)
      echo " Triggering node-issue workflow"
      "${SCRIPT_DIR}/scheduler.sh" node-issue-response &
      ;;
    KubeMemoryOvercommit|KubeCPUOvercommit)
      echo " Triggering resource-pressure workflow"
      "${SCRIPT_DIR}/scheduler.sh" resource-pressure-response &
      ;;
    TargetDown|PrometheusTargetMissing)
      echo " Triggering target-down workflow"
      "${SCRIPT_DIR}/scheduler.sh" target-down-investigation &
      ;;
    *)
      echo " No workflow mapped for ${ALERTNAME}"
      ;;
  esac
done <<<"${ALERTS}"

echo "Webhook processing complete"