porthole/internal/collectors/k8s/issues_pods.go
OpenCode Test · 1421b4659e
feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host
systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic (the issue record is sketched after this list)
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality
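
The shape of the issue record these collectors emit can be read off the pod rules in issues_pods.go below. A minimal sketch of the model package, with field names taken from that file; the concrete types behind Category and Priority, and the constant values, are assumptions:

package model

// Sketch only: field names match the usages in issues_pods.go; the types of
// Category and Priority are assumed here.
type Category string

type Priority int

const (
	CategoryKubernetes Category = "kubernetes"
)

const (
	PriorityP0 Priority = iota // most urgent
	PriorityP1
	PriorityP2
	PriorityP3
)

// Issue is a single deduplicatable finding produced by a collector.
type Issue struct {
	ID           string            // stable dedup key, e.g. "k8s:pod:<ns>/<name>:CrashLoop:<container>"
	Category     Category
	Priority     Priority
	Title        string
	Details      string
	Evidence     map[string]string // structured context for the details pane and JSON export
	SuggestedFix string            // copy-pasteable commands, e.g. kubectl describe/logs
}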

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression (see the sketch after this list)
- RBAC error handling
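
A hypothetical sketch of how resolve-after tracking can double as flap suppression; the package, function name, and signature are assumptions, not the actual store/engine code. An issue stays in the active set until its rule has stopped reporting it for a full resolveAfter window, so an issue that blinks in and out between collection cycles never toggles state:

package engine // hypothetical package name

import (
	"time"

	"tower/internal/model"
)

// pruneResolved is a sketch, not the real store: it keeps an issue active
// until it has been absent for resolveAfter, which also damps issues that
// flap between consecutive collection cycles.
func pruneResolved(lastSeen map[string]time.Time, reported []model.Issue, now time.Time, resolveAfter time.Duration) {
	// Refresh everything reported this cycle.
	for _, is := range reported {
		lastSeen[is.ID] = now
	}
	// Drop issues that have stayed quiet for the full window.
	for id, seen := range lastSeen {
		if now.Sub(seen) >= resolveAfter {
			delete(lastSeen, id)
		}
	}
}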

Files: All core application code with tests for host collectors,
engine, store, model, and export packages.
2025-12-24 13:29:51 -08:00

170 lines · 5.3 KiB · Go

package k8s

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"

	"tower/internal/model"
)

// IssuesFromPods applies the PLAN.md pod rules.
//
// Pure rule function: it does not talk to the API server.
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
	if crashLoopRestartThreshold <= 0 {
		crashLoopRestartThreshold = 5
	}
	if pendingGrace <= 0 {
		pendingGrace = 120 * time.Second
	}

	out := make([]model.Issue, 0, 32)
	for _, p := range pods {
		if p == nil {
			continue
		}
		ns, name := p.Namespace, p.Name

		// Pending for too long.
		if p.Status.Phase == corev1.PodPending {
			age := now.Sub(p.CreationTimestamp.Time)
			if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
				out = append(out, model.Issue{
					ID:       fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
					Category: model.CategoryKubernetes,
					Priority: model.PriorityP1,
					Title:    fmt.Sprintf("Pod Pending: %s/%s", ns, name),
					Details:  fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
					Evidence: map[string]string{
						"kind":      "Pod",
						"reason":    "Pending",
						"namespace": ns,
						"pod":       name,
						"phase":     string(p.Status.Phase),
						"node":      p.Spec.NodeName,
					},
					SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
				})
			}
		}

		// Container-derived signals.
		for _, cs := range p.Status.ContainerStatuses {
			cname := cs.Name
			restarts := int(cs.RestartCount)

			// CrashLoopBackOff and pull errors are reported via Waiting state.
			if cs.State.Waiting != nil {
				reason := cs.State.Waiting.Reason
				msg := cs.State.Waiting.Message
				switch reason {
				case "CrashLoopBackOff":
					pri := model.PriorityP1
					if restarts >= crashLoopRestartThreshold {
						pri = model.PriorityP0
					}
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: pri,
						Title:    fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "CrashLoopBackOff",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})
				case "ImagePullBackOff", "ErrImagePull":
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
						Details:  firstNonEmpty(msg, "Container image pull is failing."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    reason,
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}

			// OOMKilled is typically stored in LastTerminationState.
			if cs.LastTerminationState.Terminated != nil {
				term := cs.LastTerminationState.Terminated
				if term.Reason == "OOMKilled" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(term.Message, "Container was killed due to OOM."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "OOMKilled",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})
				}
			}

			// High restarts even if running.
			// Keep this lower priority than active CrashLoopBackOff.
			if restarts >= crashLoopRestartThreshold {
				if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP2,
						Title:    fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
						Details:  "Container has restarted multiple times.",
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "HighRestarts",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}
		}
	}
	return out
}

func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
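
For reference, a minimal sketch of how IssuesFromPods could be exercised from a unit test in the same package. The file and test names are invented, the pod literal is made up for illustration, and model.PriorityP0 is referenced as used in issues_pods.go:

package k8s

import (
	"testing"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"tower/internal/model"
)

// Hypothetical test: one container stuck in CrashLoopBackOff with enough
// restarts to escalate the issue to P0; no other rules should fire.
func TestIssuesFromPods_CrashLoopEscalatesToP0(t *testing.T) {
	now := time.Now()
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:              "api-6d4f9c7b8-xk2lp",
			Namespace:         "default",
			CreationTimestamp: metav1.NewTime(now.Add(-10 * time.Minute)),
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				Name:         "api",
				RestartCount: 7,
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{
						Reason:  "CrashLoopBackOff",
						Message: "back-off 5m0s restarting failed container",
					},
				},
			}},
		},
	}

	// Zero thresholds fall back to the defaults (120s pending grace, 5 restarts).
	issues := IssuesFromPods([]*corev1.Pod{pod}, now, 0, 0)

	if len(issues) != 1 {
		t.Fatalf("expected exactly one issue, got %d", len(issues))
	}
	if issues[0].Priority != model.PriorityP0 {
		t.Fatalf("expected P0 for 7 restarts, got %v", issues[0].Priority)
	}
}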