Add complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collectors, engine, store, model, and export packages.
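For orientation, here is a minimal sketch of the `model.Issue` type and the priority/category constants that the pod rules below populate. The real definitions live in `tower/internal/model` and are not included in this gist; the field and constant names are taken from their usage in pods.go, but the underlying types (and the P3 constant implied by the 0/1/2/3 jump keys) are assumptions.

```go
// Assumed sketch of tower/internal/model; field names inferred from pods.go,
// concrete types are guesses and may differ from the real package.
package model

type Priority string

const (
	PriorityP0 Priority = "P0" // most urgent
	PriorityP1 Priority = "P1"
	PriorityP2 Priority = "P2"
	PriorityP3 Priority = "P3" // assumed: implied by the priority-jump keys
)

type Category string

const CategoryKubernetes Category = "kubernetes"

// Issue is one finding. ID must be stable across collector runs so the store
// can deduplicate, track acknowledgement state, and apply resolve-after logic.
type Issue struct {
	ID           string
	Category     Category
	Priority     Priority
	Title        string
	Details      string
	Evidence     map[string]string
	SuggestedFix string
}
```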
170 lines · 5.3 KiB · Go
package k8s

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"

	"tower/internal/model"
)

// IssuesFromPods applies the PLAN.md pod rules.
//
// Pure rule function: it does not talk to the API server.
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
	if crashLoopRestartThreshold <= 0 {
		crashLoopRestartThreshold = 5
	}
	if pendingGrace <= 0 {
		pendingGrace = 120 * time.Second
	}

	out := make([]model.Issue, 0, 32)
	for _, p := range pods {
		if p == nil {
			continue
		}
		ns, name := p.Namespace, p.Name

		// Pending for too long.
		if p.Status.Phase == corev1.PodPending {
			age := now.Sub(p.CreationTimestamp.Time)
			if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
				out = append(out, model.Issue{
					ID:       fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
					Category: model.CategoryKubernetes,
					Priority: model.PriorityP1,
					Title:    fmt.Sprintf("Pod Pending: %s/%s", ns, name),
					Details:  fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
					Evidence: map[string]string{
						"kind":      "Pod",
						"reason":    "Pending",
						"namespace": ns,
						"pod":       name,
						"phase":     string(p.Status.Phase),
						"node":      p.Spec.NodeName,
					},
					SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
				})
			}
		}

		// Container-derived signals.
		for _, cs := range p.Status.ContainerStatuses {
			cname := cs.Name
			restarts := int(cs.RestartCount)

			// CrashLoopBackOff and pull errors are reported via Waiting state.
			if cs.State.Waiting != nil {
				reason := cs.State.Waiting.Reason
				msg := cs.State.Waiting.Message
				switch reason {
				case "CrashLoopBackOff":
					pri := model.PriorityP1
					if restarts >= crashLoopRestartThreshold {
						pri = model.PriorityP0
					}
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: pri,
						Title:    fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "CrashLoopBackOff",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})

				case "ImagePullBackOff", "ErrImagePull":
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
						Details:  firstNonEmpty(msg, "Container image pull is failing."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    reason,
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}

			// OOMKilled is typically stored in LastTerminationState.
			if cs.LastTerminationState.Terminated != nil {
				term := cs.LastTerminationState.Terminated
				if term.Reason == "OOMKilled" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(term.Message, "Container was killed due to OOM."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "OOMKilled",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})
				}
			}

			// High restarts even if running.
			// Keep this lower priority than active CrashLoopBackOff.
			if restarts >= crashLoopRestartThreshold {
				if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP2,
						Title:    fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
						Details:  "Container has restarted multiple times.",
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "HighRestarts",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}
		}
	}

	return out
}

func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
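Because IssuesFromPods is a pure rule function, it can be unit-tested with hand-built corev1.Pod values and no API server. The sketch below is an assumed test (file name, namespace, and pod/container names are made up for illustration) showing the CrashLoopBackOff rule escalating to P0 once the restart count passes the default threshold of 5.

```go
package k8s

import (
	"testing"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"tower/internal/model"
)

// Hypothetical test: a container waiting in CrashLoopBackOff with 7 restarts
// (above the default threshold of 5) should yield exactly one P0 issue.
func TestIssuesFromPodsCrashLoopEscalatesToP0(t *testing.T) {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Namespace:         "payments",
			Name:              "api-7c9d",
			CreationTimestamp: metav1.NewTime(time.Now().Add(-time.Hour)),
		},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				Name:         "api",
				RestartCount: 7,
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
				},
			}},
		},
	}

	// Zero grace/threshold arguments select the defaults (120s, 5 restarts).
	issues := IssuesFromPods([]*corev1.Pod{pod}, time.Now(), 0, 0)
	if len(issues) != 1 {
		t.Fatalf("expected 1 issue, got %d", len(issues))
	}
	if issues[0].Priority != model.PriorityP0 {
		t.Fatalf("expected P0, got %v", issues[0].Priority)
	}
}
```

The high-restarts rule does not fire here because the container has a non-empty Waiting reason, so the CrashLoopBackOff issue is the only one reported.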