---
# Workflow: diagnose a pod stuck in CrashLoopBackOff, apply actions the
# diagnosis classifies as safe, and propose (behind confirmation) the ones
# that need approval.
#
# Data flow: identify-pod feeds the three evidence-gathering steps
# (analyze-logs, check-resources, check-dependencies); their outputs are
# combined by diagnose-and-recommend, whose result gates the two
# remediation steps through their `condition` expressions.
#
# NOTE(review): step ids contain hyphens and are referenced as
# `steps.<id>.output` inside templates. In stock Jinja2 `a-b` parses as a
# subtraction; confirm the workflow engine resolves hyphenated ids, or
# switch the references to `steps['<id>']`.
name: pod-crashloop-remediation
description: Diagnose and remediate pods in CrashLoopBackOff
version: "1.0"

# Runs either from the KubePodCrashLooping alert or on manual request.
trigger:
  - alert:
      match:
        alertname: KubePodCrashLooping
  - manual: true

# The identify-pod task falls back to the alert labels
# (alert.labels.namespace / alert.labels.pod) when these are not supplied.
inputs:
  - name: namespace
    description: Pod namespace
    required: true
  - name: pod
    description: Pod name (or prefix)
    required: true

# Model used by steps that do not set `model` themselves (every step below
# sets it explicitly).
defaults:
  model: sonnet

steps:
  # Resolve the target pod. check-resources and the completion notification
  # read `pod_name` / `namespace` from this step's output, so the task asks
  # for them by name.
  - name: identify-pod
    agent: k8s-diagnostician
    model: haiku
    task: |
      Identify the crashing pod:
      - Namespace: {{ inputs.namespace | default(alert.labels.namespace) }}
      - Pod: {{ inputs.pod | default(alert.labels.pod) }}

      Get pod details:
      - Current status and restart count
      - Last restart reason
      - Container statuses

      Include these fields in the result (later steps read them):
      - pod_name: full name of the resolved pod
      - namespace: namespace of the resolved pod
    output: pod_info

  # Evidence 1/3: container logs from the current and previous instance.
  - name: analyze-logs
    agent: k8s-diagnostician
    model: sonnet
    task: |
      Analyze pod logs for crash cause:
      - Get current container logs (last 50 lines)
      - Get previous container logs if available
      - Look for error patterns:
        - OOMKilled (exit code 137)
        - Segfault (exit code 139)
        - Application errors
        - Configuration errors
        - Dependency failures

      Pod info: {{ steps.identify-pod.output }}
    output: log_analysis

  # Evidence 2/3: resource usage leading up to the crash.
  - name: check-resources
    agent: prometheus-analyst
    model: haiku
    task: |
      Check resource usage before crash:
      - Memory usage trend (last 30 min)
      - CPU usage trend (last 30 min)
      - Compare to resource limits

      Pod: {{ steps.identify-pod.output.pod_name }}
      Namespace: {{ steps.identify-pod.output.namespace }}
    output: resource_analysis

  # Evidence 3/3: objects the pod needs in order to start.
  - name: check-dependencies
    agent: k8s-diagnostician
    model: haiku
    task: |
      Check pod dependencies:
      - ConfigMaps and Secrets exist?
      - PVCs bound?
      - Service account valid?
      - Init containers completed?

      Pod info: {{ steps.identify-pod.output }}
    output: dependency_check

  # Combine all evidence into a root cause and classified actions. The
  # remediation steps and the notification read `root_cause`,
  # `has_safe_action`, `safe_actions`, `has_confirm_action` and
  # `confirm_actions` from this output, so the task requests them.
  - name: diagnose-and-recommend
    agent: k8s-orchestrator
    model: sonnet
    task: |
      Analyze all findings and determine root cause:

      Evidence:
      - Pod info: {{ steps.identify-pod.output }}
      - Log analysis: {{ steps.analyze-logs.output }}
      - Resource usage: {{ steps.check-resources.output }}
      - Dependencies: {{ steps.check-dependencies.output }}

      Determine:
      1. Root cause (OOM, config error, dependency, application bug, etc.)
      2. Severity (auto-recoverable, needs intervention, critical)
      3. Recommended actions

      Action classification:
      - [SAFE] Restart pod, clear stuck jobs
      - [CONFIRM] Increase resources, modify config
      - [FORBIDDEN] Delete PVC, delete namespace

      Include these fields in the result (later steps read them):
      - root_cause: the determined root cause
      - has_safe_action: true if any [SAFE] action is recommended
      - safe_actions: the recommended [SAFE] actions
      - has_confirm_action: true if any [CONFIRM] action is recommended
      - confirm_actions: the recommended [CONFIRM] actions
    output: diagnosis

  # Runs only when the diagnosis reports at least one [SAFE] action.
  - name: apply-safe-remediation
    condition: "{{ steps.diagnose-and-recommend.output.has_safe_action }}"
    agent: k8s-diagnostician
    model: haiku
    task: |
      Apply safe remediation actions:
      {{ steps.diagnose-and-recommend.output.safe_actions }}

      Report what was done, listing each performed action under `actions`.
    output: safe_actions_result

  # Runs only when the diagnosis reports at least one [CONFIRM] action.
  - name: propose-confirm-actions
    condition: "{{ steps.diagnose-and-recommend.output.has_confirm_action }}"
    agent: k8s-orchestrator
    model: haiku
    task: |
      Present actions requiring confirmation:
      {{ steps.diagnose-and-recommend.output.confirm_actions }}

      For each action, explain:
      - What will change
      - Potential impact
      - Rollback option
    output: confirm_proposal
    confirm: true

outputs:
  - diagnosis
  - safe_actions_result
  - confirm_proposal

notifications:
  on_complete:
    # Both remediation steps are conditional, so their outputs may be
    # absent; `default('none')` covers the skipped case. References use the
    # step ids, matching every other `steps.<id>.output` lookup in this file.
    summary: |
      CrashLoop remediation for {{ steps.identify-pod.output.pod_name }}:
      - Root cause: {{ steps.diagnose-and-recommend.output.root_cause }}
      - Actions taken: {{ steps.apply-safe-remediation.output.actions | default('none') }}
      - Pending approval: {{ steps.propose-confirm-actions.output | default('none') }}