feat: implement ControlTower TUI for cluster and host monitoring
Add complete TUI application for monitoring Kubernetes clusters and host systems. Features include: Core features: - Collector framework with concurrent scheduling - Host collectors: disk, memory, load, network - Kubernetes collectors: pods, nodes, workloads, events with informers - Issue deduplication, state management, and resolve-after logic - Bubble Tea TUI with table view, details pane, and filtering - JSON export functionality UX improvements: - Help overlay with keybindings - Priority/category filters with visual indicators - Direct priority jump (0/1/2/3) - Bulk acknowledge (Shift+A) - Clipboard copy (y) - Theme toggle (T) - Age format toggle (d) - Wide title toggle (t) - Vi-style navigation (j/k) - Home/End jump (g/G) - Rollup drill-down in details Robustness: - Grace period for unreachable clusters - Rollups for high-volume issues - Flap suppression - RBAC error handling Files: All core application code with tests for host collectors, engine, store, model, and export packages.
This commit is contained in:
169
internal/collectors/k8s/issues_pods.go
Normal file
169
internal/collectors/k8s/issues_pods.go
Normal file
@@ -0,0 +1,169 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// IssuesFromPods applies the PLAN.md pod rules.
|
||||
//
|
||||
// Pure rule function: it does not talk to the API server.
|
||||
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
|
||||
if crashLoopRestartThreshold <= 0 {
|
||||
crashLoopRestartThreshold = 5
|
||||
}
|
||||
if pendingGrace <= 0 {
|
||||
pendingGrace = 120 * time.Second
|
||||
}
|
||||
|
||||
out := make([]model.Issue, 0, 32)
|
||||
for _, p := range pods {
|
||||
if p == nil {
|
||||
continue
|
||||
}
|
||||
ns, name := p.Namespace, p.Name
|
||||
|
||||
// Pending for too long.
|
||||
if p.Status.Phase == corev1.PodPending {
|
||||
age := now.Sub(p.CreationTimestamp.Time)
|
||||
if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("Pod Pending: %s/%s", ns, name),
|
||||
Details: fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "Pending",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"phase": string(p.Status.Phase),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Container-derived signals.
|
||||
for _, cs := range p.Status.ContainerStatuses {
|
||||
cname := cs.Name
|
||||
restarts := int(cs.RestartCount)
|
||||
|
||||
// CrashLoopBackOff and pull errors are reported via Waiting state.
|
||||
if cs.State.Waiting != nil {
|
||||
reason := cs.State.Waiting.Reason
|
||||
msg := cs.State.Waiting.Message
|
||||
switch reason {
|
||||
case "CrashLoopBackOff":
|
||||
pri := model.PriorityP1
|
||||
if restarts >= crashLoopRestartThreshold {
|
||||
pri = model.PriorityP0
|
||||
}
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: pri,
|
||||
Title: fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
|
||||
Details: firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "CrashLoopBackOff",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
|
||||
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
|
||||
})
|
||||
|
||||
case "ImagePullBackOff", "ErrImagePull":
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
|
||||
Details: firstNonEmpty(msg, "Container image pull is failing."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": reason,
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// OOMKilled is typically stored in LastTerminationState.
|
||||
if cs.LastTerminationState.Terminated != nil {
|
||||
term := cs.LastTerminationState.Terminated
|
||||
if term.Reason == "OOMKilled" {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
|
||||
Details: firstNonEmpty(term.Message, "Container was killed due to OOM."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "OOMKilled",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
|
||||
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// High restarts even if running.
|
||||
// Keep this lower priority than active CrashLoopBackOff.
|
||||
if restarts >= crashLoopRestartThreshold {
|
||||
if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP2,
|
||||
Title: fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
|
||||
Details: "Container has restarted multiple times.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "HighRestarts",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// firstNonEmpty returns v unless it is blank (empty or whitespace-only),
// in which case fallback is returned. When v is kept it is returned
// untrimmed, exactly as given.
func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) == "" {
		return fallback
	}
	return v
}
|
||||
Reference in New Issue
Block a user