feat: implement ControlTower TUI for cluster and host monitoring
Add complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code with tests for host collectors, engine, store, model, and export packages.
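For orientation while reading the diff: the collectors in this commit emit model.Issue values. Judging from how the fields are used in the file below, the type presumably looks roughly like the sketch that follows; the field types and constant values are inferred, not confirmed by this commit, and the real definition in tower/internal/model likely carries extra state fields (first/last seen, acknowledgement, resolve-after bookkeeping) that this file never touches.

package model // sketch only; inferred from usage in issues_workloads.go

// Category and Priority are assumed to be small enum-like types; the
// underlying types and values below are placeholders, not the real ones.
type Category string
type Priority int

const (
    CategoryKubernetes Category = "kubernetes" // placeholder value
    PriorityP1         Priority = 1            // placeholder value
)

// Issue is the shape implied by the composite literals in the diff below.
type Issue struct {
    ID           string            // stable dedup key, e.g. "k8s:deploy:ns/name:NotReady"
    Category     Category
    Priority     Priority
    Title        string            // one-line summary for the table view
    Details      string            // longer text for the details pane
    Evidence     map[string]string // structured facts backing the issue
    SuggestedFix string            // copy-pasteable remediation hint
}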
internal/collectors/k8s/issues_workloads.go (new normal file, 174 additions)
@@ -0,0 +1,174 @@
package k8s

import (
    "fmt"
    "strconv"
    "time"

    appsv1 "k8s.io/api/apps/v1"

    "tower/internal/model"
)

// defaultWorkloadNotReadyGrace is how long a workload must be NotReady before we emit an issue.
const defaultWorkloadNotReadyGrace = 180 * time.Second

// IssuesFromDeployments applies the PLAN.md workload rules for Deployments.
func IssuesFromDeployments(deploys []*appsv1.Deployment, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, d := range deploys {
        if d == nil {
            continue
        }
        desired := int32(1)
        if d.Spec.Replicas != nil {
            desired = *d.Spec.Replicas
        }
        ready := d.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            // Prefer LastUpdateTime / LastTransitionTime when available; fallback to creation time.
            since := d.CreationTimestamp.Time
            if cond := findDeploymentProgressingCondition(d); cond != nil {
                if !cond.LastUpdateTime.IsZero() {
                    since = cond.LastUpdateTime.Time
                } else if !cond.LastTransitionTime.IsZero() {
                    since = cond.LastTransitionTime.Time
                }
            }
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns := d.Namespace
            name := d.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:deploy:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("Deployment not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "Deployment",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(d.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(d.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe deployment %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromStatefulSets applies the PLAN.md workload rules for StatefulSets.
func IssuesFromStatefulSets(sts []*appsv1.StatefulSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, s := range sts {
        if s == nil {
            continue
        }
        desired := int32(1)
        if s.Spec.Replicas != nil {
            desired = *s.Spec.Replicas
        }
        ready := s.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            since := s.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns, name := s.Namespace, s.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:sts:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("StatefulSet not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "StatefulSet",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(s.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(s.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe statefulset %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromDaemonSets applies the PLAN.md workload rules for DaemonSets.
func IssuesFromDaemonSets(dss []*appsv1.DaemonSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, ds := range dss {
        if ds == nil {
            continue
        }
        unavailable := ds.Status.NumberUnavailable
        if unavailable > 0 {
            since := ds.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }
            ns, name := ds.Namespace, ds.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:ds:%s/%s:Unavailable", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("DaemonSet unavailable: %s/%s", ns, name),
                Details:  "DaemonSet has unavailable pods.",
                Evidence: map[string]string{
                    "kind":          "DaemonSet",
                    "reason":        "Unavailable",
                    "namespace":     ns,
                    "name":          name,
                    "unavailable":   strconv.Itoa(int(unavailable)),
                    "desired":       strconv.Itoa(int(ds.Status.DesiredNumberScheduled)),
                    "available":     strconv.Itoa(int(ds.Status.NumberAvailable)),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe daemonset %s", ns, name),
            })
        }
    }

    return out
}

func findDeploymentProgressingCondition(d *appsv1.Deployment) *appsv1.DeploymentCondition {
    if d == nil {
        return nil
    }
    for i := range d.Status.Conditions {
        c := &d.Status.Conditions[i]
        if c.Type == appsv1.DeploymentProgressing {
            return c
        }
    }
    return nil
}
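A quick way to sanity-check the workload grace logic above is a small unit test. The sketch below is illustrative only and not part of this commit: the test name, namespace, and fixture values are invented, and it assumes only the standard appsv1/metav1 types already imported by the file. It builds a Deployment that has been not ready for longer than the grace window and expects exactly one P1 issue, then shrinks the age below the window and expects suppression.

package k8s

import (
    "testing"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    "tower/internal/model"
)

func TestIssuesFromDeploymentsGrace(t *testing.T) {
    now := time.Now()
    replicas := int32(3)

    d := &appsv1.Deployment{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "api",
            Namespace: "prod",
            // Created well before the grace window, so the issue should fire.
            CreationTimestamp: metav1.NewTime(now.Add(-10 * time.Minute)),
        },
        Spec:   appsv1.DeploymentSpec{Replicas: &replicas},
        Status: appsv1.DeploymentStatus{ReadyReplicas: 1},
    }

    issues := IssuesFromDeployments([]*appsv1.Deployment{d}, now, 3*time.Minute)
    if len(issues) != 1 {
        t.Fatalf("expected 1 issue, got %d", len(issues))
    }
    if issues[0].Priority != model.PriorityP1 {
        t.Fatalf("expected P1, got %v", issues[0].Priority)
    }

    // Inside the grace window the same Deployment should be suppressed.
    recent := d.DeepCopy()
    recent.CreationTimestamp = metav1.NewTime(now.Add(-30 * time.Second))
    if got := IssuesFromDeployments([]*appsv1.Deployment{recent}, now, 3*time.Minute); len(got) != 0 {
        t.Fatalf("expected no issues inside the grace window, got %d", len(got))
    }
}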