feat: Implement Phase 3 automation for K8s agent system
Automation components for scheduled and event-driven workflows:
Scheduler:
- scheduler.sh for cron-based workflow execution
- Logs workflow runs to ~/.claude/logs/workflows/
- Notifies dashboard on completion
Alertmanager Integration:
- webhook-receiver.sh for processing alerts
- Dashboard endpoint /api/webhooks/alertmanager
- Example alertmanager-config.yaml with routing rules
- Maps alerts to workflows (crashloop, node issues, resources)
New Incident Workflows:
- node-issue-response.yaml: Handle NotReady/unreachable nodes
- resource-pressure-response.yaml: Respond to memory/CPU overcommit
- argocd-sync-failure.yaml: Investigate and fix sync failures
Dashboard Updates:
- POST /api/webhooks/alertmanager endpoint
- POST /api/workflows/{name}/complete endpoint
- Alerts create pending actions for visibility
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
55
automation/alertmanager-config.yaml
Normal file
55
automation/alertmanager-config.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
# Alertmanager Configuration Example
|
||||
# Add this webhook receiver to send alerts to the K8s agent system
|
||||
#
|
||||
# This is an EXAMPLE - merge with your existing alertmanager config
|
||||
|
||||
global:
  # Alerts that carry no end time are declared resolved once they have not
  # been updated for this long.
  resolve_timeout: 5m
|
||||
|
||||
route:
  # Alerts that match none of the child routes below go to this receiver.
  receiver: 'default'
  group_by: ['alertname', 'namespace']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  # Child routes use `matchers` (Alertmanager v0.22+) instead of the
  # deprecated `match_re`. Each one sets `continue: true` so that sibling
  # routes listed after it (for example routes merged in from your existing
  # config) are still evaluated for the same alert.
  # Note: an alert matched here is NOT also delivered to the top-level
  # 'default' receiver unless a later sibling route matches it.
  routes:
    # Route pod-related alerts to the agent
    - matchers:
        - 'alertname =~ "^(KubePodCrashLooping|KubePodNotReady|KubeContainerWaiting)$"'
      receiver: 'k8s-agent'
      continue: true

    # Route node-related alerts to the agent
    - matchers:
        - 'alertname =~ "^(KubeNodeNotReady|KubeNodeUnreachable|NodeMemoryHighUsage)$"'
      receiver: 'k8s-agent'
      continue: true

    # Route resource alerts to the agent
    - matchers:
        - 'alertname =~ "^(KubeMemoryOvercommit|KubeCPUOvercommit|KubePersistentVolumeFillingUp)$"'
      receiver: 'k8s-agent'
      continue: true
|
||||
|
||||
receivers:
  # Placeholder: attach your own notification configs here
  # (email, slack, etc.)
  - name: 'default'

  # Receiver that forwards alerts to the K8s agent system.
  - name: 'k8s-agent'
    webhook_configs:
      # Option 1: Dashboard webhook endpoint
      - url: 'http://k8s-agent-dashboard.k8s-agent.svc.cluster.local/api/webhooks/alertmanager'
        # Also notify the endpoint when an alert resolves.
        send_resolved: true

      # Option 2: External webhook receiver (if dashboard not available)
      # - url: 'http://your-webhook-host:9000/alerts'
      #   send_resolved: true
|
||||
|
||||
# Inhibit rules - suppress redundant notifications.
# While a 'critical' alert is firing, mute any 'warning' alert that carries
# the same alertname and namespace.
# Uses `source_matchers` / `target_matchers` (Alertmanager v0.22+) in place of
# the deprecated `source_match` / `target_match`.
inhibit_rules:
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'namespace']
|
||||
Reference in New Issue
Block a user