feat: implement ControlTower TUI for cluster and host monitoring
Add complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code with tests for host collectors, engine, store, model, and export packages.
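For orientation while reading the diff: the collectors in this commit emit model.Issue values. Judging from how the fields are used in the file below, the type presumably looks roughly like the sketch that follows; the field types and constant values are inferred, not confirmed by this commit, and the real definition in tower/internal/model likely carries extra state fields (first/last seen, acknowledgement, resolve-after bookkeeping) that this file never touches.

package model // sketch only; inferred from usage in issues_workloads.go

// Category and Priority are assumed to be small enum-like types; the
// underlying types and values below are placeholders, not the real ones.
type Category string
type Priority int

const (
    CategoryKubernetes Category = "kubernetes" // placeholder value
    PriorityP1         Priority = 1            // placeholder value
)

// Issue is the shape implied by the composite literals in the diff below.
type Issue struct {
    ID           string            // stable dedup key, e.g. "k8s:deploy:ns/name:NotReady"
    Category     Category
    Priority     Priority
    Title        string            // one-line summary for the table view
    Details      string            // longer text for the details pane
    Evidence     map[string]string // structured facts backing the issue
    SuggestedFix string            // copy-pasteable remediation hint
}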
internal/collectors/k8s/issues_workloads.go (new normal file, 174 additions)
@@ -0,0 +1,174 @@
package k8s

import (
    "fmt"
    "strconv"
    "time"

    appsv1 "k8s.io/api/apps/v1"

    "tower/internal/model"
)

// defaultWorkloadNotReadyGrace is how long a workload must be NotReady before we emit an issue.
const defaultWorkloadNotReadyGrace = 180 * time.Second

// IssuesFromDeployments applies the PLAN.md workload rules for Deployments.
func IssuesFromDeployments(deploys []*appsv1.Deployment, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, d := range deploys {
        if d == nil {
            continue
        }
        desired := int32(1)
        if d.Spec.Replicas != nil {
            desired = *d.Spec.Replicas
        }
        ready := d.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            // Prefer LastUpdateTime / LastTransitionTime when available; fallback to creation time.
            since := d.CreationTimestamp.Time
            if cond := findDeploymentProgressingCondition(d); cond != nil {
                if !cond.LastUpdateTime.IsZero() {
                    since = cond.LastUpdateTime.Time
                } else if !cond.LastTransitionTime.IsZero() {
                    since = cond.LastTransitionTime.Time
                }
            }
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns := d.Namespace
            name := d.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:deploy:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("Deployment not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "Deployment",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(d.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(d.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe deployment %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromStatefulSets applies the PLAN.md workload rules for StatefulSets.
func IssuesFromStatefulSets(sts []*appsv1.StatefulSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, s := range sts {
        if s == nil {
            continue
        }
        desired := int32(1)
        if s.Spec.Replicas != nil {
            desired = *s.Spec.Replicas
        }
        ready := s.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            since := s.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns, name := s.Namespace, s.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:sts:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("StatefulSet not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "StatefulSet",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(s.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(s.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe statefulset %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromDaemonSets applies the PLAN.md workload rules for DaemonSets.
func IssuesFromDaemonSets(dss []*appsv1.DaemonSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, ds := range dss {
        if ds == nil {
            continue
        }
        unavailable := ds.Status.NumberUnavailable
        if unavailable > 0 {
            since := ds.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }
            ns, name := ds.Namespace, ds.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:ds:%s/%s:Unavailable", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("DaemonSet unavailable: %s/%s", ns, name),
                Details:  "DaemonSet has unavailable pods.",
                Evidence: map[string]string{
                    "kind":          "DaemonSet",
                    "reason":        "Unavailable",
                    "namespace":     ns,
                    "name":          name,
                    "unavailable":   strconv.Itoa(int(unavailable)),
                    "desired":       strconv.Itoa(int(ds.Status.DesiredNumberScheduled)),
                    "available":     strconv.Itoa(int(ds.Status.NumberAvailable)),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe daemonset %s", ns, name),
            })
        }
    }

    return out
}

func findDeploymentProgressingCondition(d *appsv1.Deployment) *appsv1.DeploymentCondition {
    if d == nil {
        return nil
    }
    for i := range d.Status.Conditions {
        c := &d.Status.Conditions[i]
        if c.Type == appsv1.DeploymentProgressing {
            return c
        }
    }
    return nil
}
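A quick way to sanity-check the workload grace logic above is a small unit test. The sketch below is illustrative only and not part of this commit: the test name, namespace, and fixture values are invented, and it assumes only the standard appsv1/metav1 types already imported by the file. It builds a Deployment that has been not ready for longer than the grace window and expects exactly one P1 issue, then shrinks the age below the window and expects suppression.

package k8s

import (
    "testing"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    "tower/internal/model"
)

func TestIssuesFromDeploymentsGrace(t *testing.T) {
    now := time.Now()
    replicas := int32(3)

    d := &appsv1.Deployment{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "api",
            Namespace: "prod",
            // Created well before the grace window, so the issue should fire.
            CreationTimestamp: metav1.NewTime(now.Add(-10 * time.Minute)),
        },
        Spec:   appsv1.DeploymentSpec{Replicas: &replicas},
        Status: appsv1.DeploymentStatus{ReadyReplicas: 1},
    }

    issues := IssuesFromDeployments([]*appsv1.Deployment{d}, now, 3*time.Minute)
    if len(issues) != 1 {
        t.Fatalf("expected 1 issue, got %d", len(issues))
    }
    if issues[0].Priority != model.PriorityP1 {
        t.Fatalf("expected P1, got %v", issues[0].Priority)
    }

    // Inside the grace window the same Deployment should be suppressed.
    recent := d.DeepCopy()
    recent.CreationTimestamp = metav1.NewTime(now.Add(-30 * time.Second))
    if got := IssuesFromDeployments([]*appsv1.Deployment{recent}, now, 3*time.Minute); len(got) != 0 {
        t.Fatalf("expected no issues inside the grace window, got %d", len(got))
    }
}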