feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collectors, engine, store, model, and export packages.
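For orientation, the collector contract that the scheduling framework drives is small. The sketch below is inferred from the Kubernetes collector in this commit (the `collectors.Collector` assertion and the `Name`/`Interval`/`Collect` signatures and `collectors.Status` fields all appear in internal/collectors/k8s/informers.go); the heartbeat collector itself is a hypothetical illustration, not part of the commit.

package examplecollector

import (
    "context"
    "time"

    "tower/internal/collectors"
    "tower/internal/model"
)

// heartbeat is a hypothetical collector that emits no issues; it only
// demonstrates the three methods the engine schedules concurrently.
type heartbeat struct{}

func (heartbeat) Name() string            { return "heartbeat" }
func (heartbeat) Interval() time.Duration { return 5 * time.Second }

func (heartbeat) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
    if err := ctx.Err(); err != nil {
        return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
    }
    return nil, collectors.Status{Health: collectors.HealthOK, LastSuccess: time.Now()}, nil
}

var _ collectors.Collector = heartbeat{}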
internal/collectors/k8s/client.go (new file, 88 lines)
@@ -0,0 +1,88 @@
package k8s

import (
    "context"
    "errors"
    "fmt"
    "os"
    "path/filepath"
    "time"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
    "k8s.io/client-go/tools/clientcmd"
)

// ClientFromCurrentContext creates a Kubernetes client-go Clientset using the
// user's kubeconfig current context.
//
// It is a pure helper (no global state) so it can be used by collectors and
// unit tests (with temporary kubeconfig files).
func ClientFromCurrentContext() (*kubernetes.Clientset, *rest.Config, error) {
    loadingRules := clientcmd.NewDefaultClientConfigLoadingRules()

    // Respect KUBECONFIG semantics (it may be a path list).
    if p := os.Getenv("KUBECONFIG"); p != "" {
        if list := filepath.SplitList(p); len(list) > 1 {
            loadingRules.ExplicitPath = ""
            loadingRules.Precedence = list
        } else {
            loadingRules.ExplicitPath = p
        }
    }

    cfg := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, &clientcmd.ConfigOverrides{})
    restCfg, err := cfg.ClientConfig()
    if err != nil {
        return nil, nil, err
    }

    // Ensure HTTP client timeouts are bounded. LIST fallback uses its own context
    // timeouts, but this provides a safety net.
    if restCfg.Timeout <= 0 {
        restCfg.Timeout = 30 * time.Second
    }

    cs, err := kubernetes.NewForConfig(restCfg)
    if err != nil {
        return nil, nil, err
    }
    return cs, restCfg, nil
}

func defaultKubeconfigPath() string {
    // This helper is used only for existence checks / UI messages. Client loading
    // should use client-go's default loading rules.
    if p := os.Getenv("KUBECONFIG"); p != "" {
        // If KUBECONFIG is a list, return the first entry for display.
        if list := filepath.SplitList(p); len(list) > 0 {
            return list[0]
        }
        return p
    }

    h, err := os.UserHomeDir()
    if err != nil {
        return ""
    }
    return filepath.Join(h, ".kube", "config")
}

// Ping performs a lightweight API call to determine if the cluster is reachable
// and authentication works.
func Ping(ctx context.Context, cs kubernetes.Interface) error {
    if cs == nil {
        return errors.New("nil kubernetes client")
    }
    _, err := cs.Discovery().ServerVersion()
    if err != nil {
        // Treat authn/authz errors separately so callers can decide whether to
        // surface "unreachable" vs "insufficient credentials".
        if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) {
            return fmt.Errorf("discovery auth: %w", err)
        }
        return fmt.Errorf("discovery server version: %w", err)
    }
    return nil
}
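A minimal usage sketch of the two exported helpers above, assuming a kubeconfig with a reachable current context. The import path tower/internal/collectors/k8s matches the module path used elsewhere in this commit; the standalone main is illustrative only.

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "tower/internal/collectors/k8s"
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()

    // Load the current kubeconfig context and build a clientset.
    cs, restCfg, err := k8s.ClientFromCurrentContext()
    if err != nil {
        log.Fatalf("load kubeconfig: %v", err)
    }
    fmt.Println("API server:", restCfg.Host)

    // Lightweight reachability/auth probe before starting collectors.
    if err := k8s.Ping(ctx, cs); err != nil {
        log.Fatalf("cluster unreachable or auth failed: %v", err)
    }
}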
internal/collectors/k8s/informers.go (new file, 720 lines)
@@ -0,0 +1,720 @@
package k8s

import (
    "context"
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "sync"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/client-go/informers"
    "k8s.io/client-go/kubernetes"
    appslisters "k8s.io/client-go/listers/apps/v1"
    corelisters "k8s.io/client-go/listers/core/v1"
    "k8s.io/client-go/tools/cache"

    "tower/internal/collectors"
    "tower/internal/model"
)

// Collector is the ControlTower Kubernetes collector.
//
// It uses client-go informers (LIST+WATCH with local caches) against the user's
// kubeconfig current context, across all namespaces.
//
// Degradation behavior:
//   - If WATCH fails repeatedly, it falls back to polling LIST and emits a P1
//     "degraded to polling" issue.
//   - While in polling mode, it periodically attempts to recover back to watches.
//   - If the cluster is unreachable, it emits a P0 only after 10s continuous failure.
//   - If RBAC forbids list/watch for a resource, it emits a single P2 issue per
//     inaccessible resource and continues for accessible resources.
//
// Noise control:
//   - Rollups group by (namespace, reason, kind) when group size >= 20.
//   - Cap max issues to 200 after rollups.
//
// Instantiate with NewCollector().
type Collector struct {
    interval time.Duration

    unreachableGrace time.Duration
    pendingGrace     time.Duration
    workloadGrace    time.Duration
    crashLoopThresh  int

    rollupThreshold int
    maxIssues       int

    watchFailureThreshold int
    watchFailureWindow    time.Duration
    pollRecoverEvery      time.Duration

    mu     sync.Mutex
    syncWG sync.WaitGroup

    client kubernetes.Interface

    factory   informers.SharedInformerFactory
    stopCh    chan struct{}
    started   bool
    syncedFns []cache.InformerSynced

    podsLister        corelisters.PodLister
    nodesLister       corelisters.NodeLister
    eventsLister      corelisters.EventLister
    deployLister      appslisters.DeploymentLister
    statefulSetLister appslisters.StatefulSetLister
    daemonSetLister   appslisters.DaemonSetLister

    // polling indicates we have degraded from informers to list polling.
    polling                bool
    pollSince              time.Time
    lastPollRecoverAttempt time.Time

    watchFailWindowStart time.Time
    watchFailCount       int

    // rbacDenied is keyed by resource name ("pods", "nodes", ...).
    rbacDenied map[string]error

    unreach *unreachableTracker

    lastSuccess time.Time
}

func NewCollector() *Collector {
    c := &Collector{
        interval:              2 * time.Second,
        unreachableGrace:      10 * time.Second,
        pendingGrace:          120 * time.Second,
        workloadGrace:         180 * time.Second,
        crashLoopThresh:       5,
        rollupThreshold:       20,
        maxIssues:             200,
        watchFailureThreshold: 5,
        watchFailureWindow:    30 * time.Second,
        pollRecoverEvery:      30 * time.Second,
        rbacDenied:            map[string]error{},
    }
    c.unreach = newUnreachableTracker(c.unreachableGrace)
    return c
}

var _ collectors.Collector = (*Collector)(nil)

func (c *Collector) Name() string { return "k8s" }

func (c *Collector) Interval() time.Duration {
    if c.interval <= 0 {
        return 2 * time.Second
    }
    return c.interval
}

func (c *Collector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
    now := time.Now()
    if err := ctx.Err(); err != nil {
        return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
    }

    // If kubeconfig doesn't exist, treat Kubernetes as "disabled".
    if !kubeconfigExists() {
        return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "kubeconfig not found"}, nil
    }

    if err := c.ensureClient(); err != nil {
        c.unreach.observeFailure(now, err)
        if c.unreach.shouldEmit(now) {
            iss := stampIssueTimes(now, unreachableIssue(err))
            return []model.Issue{iss}, collectors.Status{Health: collectors.HealthError, Message: "unreachable"}, nil
        }
        return nil, collectors.Status{Health: collectors.HealthError, Message: "k8s client init failed (grace)"}, nil
    }

    // Connectivity/auth check with grace.
    if err := Ping(ctx, c.client); err != nil {
        c.unreach.observeFailure(now, err)
        if c.unreach.shouldEmit(now) {
            iss := stampIssueTimes(now, unreachableIssue(err))
            return []model.Issue{iss}, collectors.Status{Health: collectors.HealthError, Message: "unreachable"}, nil
        }
        return nil, collectors.Status{Health: collectors.HealthError, Message: "k8s unreachable (grace)"}, nil
    }
    c.unreach.observeSuccess()
    c.lastSuccess = now

    // Prefer informers unless currently degraded to polling.
    if c.isPolling() {
        c.maybeRecoverInformers(ctx, now)
    }
    if !c.isPolling() {
        _ = c.ensureInformers(ctx)
    }

    issues := make([]model.Issue, 0, 64)
    issues = append(issues, c.rbacIssues()...)

    st := collectors.Status{Health: collectors.HealthOK, LastSuccess: c.lastSuccess}

    if c.isPolling() {
        st.Health = collectors.HealthDegraded
        st.Message = "degraded to polling"
        issues = append(issues, stampIssueTimes(now, pollingDegradedIssue()))
        issues = append(issues, c.collectByPolling(ctx, now)...)
    } else {
        // If caches aren't ready, use polling for this tick only.
        if !c.cachesSyncedQuick(ctx) {
            st.Health = collectors.HealthDegraded
            st.Message = "waiting for informer cache; used list"
            issues = append(issues, c.collectByPolling(ctx, now)...)
        } else {
            issues = append(issues, c.collectFromCaches(now)...)
            if len(c.snapshotRBACDenied()) > 0 {
                st.Health = collectors.HealthDegraded
                st.Message = "partial RBAC access"
            }
        }
    }

    // Set timestamps, roll up and cap.
    for i := range issues {
        issues[i] = stampIssueTimes(now, issues[i])
    }
    issues = Rollup(issues, c.rollupThreshold, 5)
    model.SortIssuesDefault(issues)
    issues = CapIssues(issues, c.maxIssues)

    return issues, st, nil
}

func (c *Collector) ensureClient() error {
    c.mu.Lock()
    defer c.mu.Unlock()
    if c.client != nil {
        return nil
    }
    cs, _, err := ClientFromCurrentContext()
    if err != nil {
        return err
    }
    c.client = cs
    return nil
}

func kubeconfigExists() bool {
    if p := os.Getenv("KUBECONFIG"); p != "" {
        for _, fp := range filepath.SplitList(p) {
            if fp == "" {
                continue
            }
            if _, err := os.Stat(fp); err == nil {
                return true
            }
        }
        return false
    }

    p := defaultKubeconfigPath()
    if p == "" {
        return false
    }
    _, err := os.Stat(p)
    return err == nil
}

func (c *Collector) ensureInformers(ctx context.Context) error {
    c.mu.Lock()
    if c.started || c.polling {
        c.mu.Unlock()
        return nil
    }
    client := c.client
    c.mu.Unlock()
    if client == nil {
        return fmt.Errorf("nil kubernetes client")
    }

    // RBAC preflight before we even construct informers (so we can skip forbidden ones).
    c.preflightRBAC(ctx, client)

    factory := informers.NewSharedInformerFactory(client, 0)

    var (
        podsInf  cache.SharedIndexInformer
        nodesInf cache.SharedIndexInformer
        evsInf   cache.SharedIndexInformer
        depInf   cache.SharedIndexInformer
        stsInf   cache.SharedIndexInformer
        dsInf    cache.SharedIndexInformer
    )

    if !c.isRBACDenied("pods") {
        i := factory.Core().V1().Pods()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("pods", err) })
        c.mu.Lock()
        c.podsLister = i.Lister()
        c.mu.Unlock()
        podsInf = i.Informer()
    }
    if !c.isRBACDenied("nodes") {
        i := factory.Core().V1().Nodes()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("nodes", err) })
        c.mu.Lock()
        c.nodesLister = i.Lister()
        c.mu.Unlock()
        nodesInf = i.Informer()
    }
    if !c.isRBACDenied("events") {
        i := factory.Core().V1().Events()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("events", err) })
        c.mu.Lock()
        c.eventsLister = i.Lister()
        c.mu.Unlock()
        evsInf = i.Informer()
    }
    if !c.isRBACDenied("deployments") {
        i := factory.Apps().V1().Deployments()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("deployments", err) })
        c.mu.Lock()
        c.deployLister = i.Lister()
        c.mu.Unlock()
        depInf = i.Informer()
    }
    if !c.isRBACDenied("statefulsets") {
        i := factory.Apps().V1().StatefulSets()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("statefulsets", err) })
        c.mu.Lock()
        c.statefulSetLister = i.Lister()
        c.mu.Unlock()
        stsInf = i.Informer()
    }
    if !c.isRBACDenied("daemonsets") {
        i := factory.Apps().V1().DaemonSets()
        i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("daemonsets", err) })
        c.mu.Lock()
        c.daemonSetLister = i.Lister()
        c.mu.Unlock()
        dsInf = i.Informer()
    }

    synced := make([]cache.InformerSynced, 0, 6)
    if podsInf != nil {
        synced = append(synced, podsInf.HasSynced)
    }
    if nodesInf != nil {
        synced = append(synced, nodesInf.HasSynced)
    }
    if evsInf != nil {
        synced = append(synced, evsInf.HasSynced)
    }
    if depInf != nil {
        synced = append(synced, depInf.HasSynced)
    }
    if stsInf != nil {
        synced = append(synced, stsInf.HasSynced)
    }
    if dsInf != nil {
        synced = append(synced, dsInf.HasSynced)
    }

    stopCh := make(chan struct{})

    c.mu.Lock()
    c.factory = factory
    c.stopCh = stopCh
    c.started = true
    c.syncedFns = synced
    c.mu.Unlock()

    factory.Start(stopCh)

    c.syncWG.Add(1)
    go func() {
        defer c.syncWG.Done()
        syncCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
        defer cancel()
        if ok := cache.WaitForCacheSync(syncCtx.Done(), synced...); !ok {
            fmt.Printf("k8s: informer cache sync failed or timed out\n")
        }
    }()

    return nil
}

func (c *Collector) maybeRecoverInformers(ctx context.Context, now time.Time) {
    c.mu.Lock()
    interval := c.pollRecoverEvery
    last := c.lastPollRecoverAttempt
    c.mu.Unlock()

    if interval <= 0 {
        interval = 30 * time.Second
    }
    if !last.IsZero() && now.Sub(last) < interval {
        return
    }

    c.mu.Lock()
    c.lastPollRecoverAttempt = now
    c.mu.Unlock()

    // Only attempt if connectivity is OK (already pinged successfully in Collect).
    // Reset watch failure counters and exit polling; subsequent Collect will ensureInformers.
    c.mu.Lock()
    c.polling = false
    c.pollSince = time.Time{}
    c.watchFailWindowStart = time.Time{}
    c.watchFailCount = 0
    c.mu.Unlock()

    _ = c.ensureInformers(ctx)
}

func (c *Collector) preflightRBAC(ctx context.Context, client kubernetes.Interface) {
    shortCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
    defer cancel()

    probe := func(resource string, f func(context.Context) error) {
        if err := f(shortCtx); err != nil {
            if apierrors.IsForbidden(err) {
                c.noteRBAC(resource, err)
            }
        }
    }

    probe("nodes", func(ctx context.Context) error {
        _, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
    probe("pods", func(ctx context.Context) error {
        _, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
    probe("deployments", func(ctx context.Context) error {
        _, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
    probe("statefulsets", func(ctx context.Context) error {
        _, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
    probe("daemonsets", func(ctx context.Context) error {
        _, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
    probe("events", func(ctx context.Context) error {
        _, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
        return err
    })
}

func (c *Collector) noteRBAC(resource string, err error) {
    if err == nil || !apierrors.IsForbidden(err) {
        return
    }
    c.mu.Lock()
    defer c.mu.Unlock()
    if _, ok := c.rbacDenied[resource]; ok {
        return
    }
    c.rbacDenied[resource] = err
}

func (c *Collector) isRBACDenied(resource string) bool {
    c.mu.Lock()
    defer c.mu.Unlock()
    _, ok := c.rbacDenied[resource]
    return ok
}

func (c *Collector) snapshotRBACDenied() map[string]error {
    c.mu.Lock()
    defer c.mu.Unlock()
    out := make(map[string]error, len(c.rbacDenied))
    for k, v := range c.rbacDenied {
        out[k] = v
    }
    return out
}

func (c *Collector) recordWatchError(resource string, err error) {
    if err == nil {
        return
    }
    if apierrors.IsForbidden(err) {
        c.noteRBAC(resource, err)
        return
    }

    now := time.Now()

    c.mu.Lock()
    defer c.mu.Unlock()
    if c.polling {
        return
    }
    if c.watchFailWindowStart.IsZero() || now.Sub(c.watchFailWindowStart) > c.watchFailureWindow {
        c.watchFailWindowStart = now
        c.watchFailCount = 0
    }
    c.watchFailCount++
    if c.watchFailCount >= c.watchFailureThreshold {
        c.polling = true
        c.pollSince = now
        if c.stopCh != nil {
            close(c.stopCh)
            c.stopCh = nil
        }
        c.started = false
        c.factory = nil
        c.syncedFns = nil
        c.syncWG.Wait()
    }
}

func (c *Collector) cachesSyncedQuick(ctx context.Context) bool {
    c.mu.Lock()
    synced := append([]cache.InformerSynced(nil), c.syncedFns...)
    c.mu.Unlock()
    if len(synced) == 0 {
        return false
    }

    syncCtx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
    defer cancel()
    return cache.WaitForCacheSync(syncCtx.Done(), synced...)
}

func (c *Collector) collectFromCaches(now time.Time) []model.Issue {
    c.mu.Lock()
    podsLister := c.podsLister
    nodesLister := c.nodesLister
    eventsLister := c.eventsLister
    deployLister := c.deployLister
    stsLister := c.statefulSetLister
    dsLister := c.daemonSetLister
    denied := make(map[string]error, len(c.rbacDenied))
    for k, v := range c.rbacDenied {
        denied[k] = v
    }
    c.mu.Unlock()

    issues := make([]model.Issue, 0, 64)
    sel := labels.Everything()

    if _, ok := denied["nodes"]; !ok && nodesLister != nil {
        if list, err := nodesLister.List(sel); err == nil {
            nodes := make([]*corev1.Node, 0, len(list))
            for i := range list {
                nodes = append(nodes, list[i])
            }
            issues = append(issues, IssuesFromNodes(nodes)...)
        }
    }

    if _, ok := denied["pods"]; !ok && podsLister != nil {
        if list, err := podsLister.List(sel); err == nil {
            pods := make([]*corev1.Pod, 0, len(list))
            for i := range list {
                pods = append(pods, list[i])
            }
            issues = append(issues, IssuesFromPods(pods, now, c.pendingGrace, c.crashLoopThresh)...)
        }
    }

    if _, ok := denied["deployments"]; !ok && deployLister != nil {
        if list, err := deployLister.List(sel); err == nil {
            deps := make([]*appsv1.Deployment, 0, len(list))
            for i := range list {
                deps = append(deps, list[i])
            }
            issues = append(issues, IssuesFromDeployments(deps, now, c.workloadGrace)...)
        }
    }
    if _, ok := denied["statefulsets"]; !ok && stsLister != nil {
        if list, err := stsLister.List(sel); err == nil {
            sts := make([]*appsv1.StatefulSet, 0, len(list))
            for i := range list {
                sts = append(sts, list[i])
            }
            issues = append(issues, IssuesFromStatefulSets(sts, now, c.workloadGrace)...)
        }
    }
    if _, ok := denied["daemonsets"]; !ok && dsLister != nil {
        if list, err := dsLister.List(sel); err == nil {
            dss := make([]*appsv1.DaemonSet, 0, len(list))
            for i := range list {
                dss = append(dss, list[i])
            }
            issues = append(issues, IssuesFromDaemonSets(dss, now, c.workloadGrace)...)
        }
    }

    if _, ok := denied["events"]; !ok && eventsLister != nil {
        if list, err := eventsLister.List(sel); err == nil {
            es := make([]*corev1.Event, 0, len(list))
            for i := range list {
                es = append(es, list[i])
            }
            issues = append(issues, IssuesFromEvents(es, now)...)
        }
    }

    return issues
}

func (c *Collector) collectByPolling(ctx context.Context, now time.Time) []model.Issue {
    c.mu.Lock()
    client := c.client
    denied := make(map[string]error, len(c.rbacDenied))
    for k, v := range c.rbacDenied {
        denied[k] = v
    }
    c.mu.Unlock()
    if client == nil {
        return nil
    }

    issues := make([]model.Issue, 0, 64)

    if _, ok := denied["nodes"]; !ok {
        if nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("nodes", err)
        } else {
            list := make([]*corev1.Node, 0, len(nodes.Items))
            for i := range nodes.Items {
                list = append(list, &nodes.Items[i])
            }
            issues = append(issues, IssuesFromNodes(list)...)
        }
    }

    if _, ok := denied["pods"]; !ok {
        if pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("pods", err)
        } else {
            list := make([]*corev1.Pod, 0, len(pods.Items))
            for i := range pods.Items {
                list = append(list, &pods.Items[i])
            }
            issues = append(issues, IssuesFromPods(list, now, c.pendingGrace, c.crashLoopThresh)...)
        }
    }

    if _, ok := denied["deployments"]; !ok {
        if deps, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("deployments", err)
        } else {
            list := make([]*appsv1.Deployment, 0, len(deps.Items))
            for i := range deps.Items {
                list = append(list, &deps.Items[i])
            }
            issues = append(issues, IssuesFromDeployments(list, now, c.workloadGrace)...)
        }
    }

    if _, ok := denied["statefulsets"]; !ok {
        if sts, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("statefulsets", err)
        } else {
            list := make([]*appsv1.StatefulSet, 0, len(sts.Items))
            for i := range sts.Items {
                list = append(list, &sts.Items[i])
            }
            issues = append(issues, IssuesFromStatefulSets(list, now, c.workloadGrace)...)
        }
    }

    if _, ok := denied["daemonsets"]; !ok {
        if dss, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("daemonsets", err)
        } else {
            list := make([]*appsv1.DaemonSet, 0, len(dss.Items))
            for i := range dss.Items {
                list = append(list, &dss.Items[i])
            }
            issues = append(issues, IssuesFromDaemonSets(list, now, c.workloadGrace)...)
        }
    }

    if _, ok := denied["events"]; !ok {
        if evs, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
            c.noteRBAC("events", err)
        } else {
            list := make([]*corev1.Event, 0, len(evs.Items))
            for i := range evs.Items {
                list = append(list, &evs.Items[i])
            }
            issues = append(issues, IssuesFromEvents(list, now)...)
        }
    }

    return issues
}

func (c *Collector) rbacIssues() []model.Issue {
    denied := c.snapshotRBACDenied()
    keys := make([]string, 0, len(denied))
    for k := range denied {
        keys = append(keys, k)
    }
    sort.Strings(keys)

    out := make([]model.Issue, 0, len(keys))
    for _, res := range keys {
        err := denied[res]
        out = append(out, model.Issue{
            ID:       fmt.Sprintf("k8s:rbac:%s", res),
            Category: model.CategoryKubernetes,
            Priority: model.PriorityP2,
            Title:    fmt.Sprintf("Insufficient RBAC: list/watch %s", res),
            Details:  fmt.Sprintf("Current context cannot access %s (forbidden). %s", res, sanitizeError(err)),
            Evidence: map[string]string{
                "kind":      "Cluster",
                "reason":    "RBAC",
                "namespace": "",
                "resource":  res,
            },
            SuggestedFix: fmt.Sprintf("kubectl auth can-i list %s --all-namespaces", res),
        })
    }
    return out
}

func pollingDegradedIssue() model.Issue {
    return model.Issue{
        ID:       "k8s:cluster:polling",
        Category: model.CategoryKubernetes,
        Priority: model.PriorityP1,
        Title:    "Kubernetes degraded: polling (watch failing)",
        Details:  "Kubernetes watches have failed repeatedly; collector switched to LIST polling. Data may be less real-time and API load is higher.",
        Evidence: map[string]string{
            "kind":      "Cluster",
            "reason":    "DegradedPolling",
            "namespace": "",
        },
        SuggestedFix: "Check API server / network stability and RBAC; ensure watch endpoints are reachable.",
    }
}

func stampIssueTimes(now time.Time, iss model.Issue) model.Issue {
    iss.LastSeen = now
    if iss.FirstSeen.IsZero() {
        iss.FirstSeen = now
    }
    return iss
}

func (c *Collector) isPolling() bool {
    c.mu.Lock()
    defer c.mu.Unlock()
    return c.polling
}
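The degrade-to-polling threshold above can be exercised without a live cluster, because recordWatchError only mutates collector state for non-Forbidden errors. A possible in-package test sketch (the test name and the synthetic error are hypothetical; the threshold field comes from NewCollector):

package k8s

import (
    "errors"
    "testing"
)

// TestRecordWatchErrorDegradesToPolling drives enough watch failures through
// recordWatchError to cross watchFailureThreshold and flip the collector into
// polling mode.
func TestRecordWatchErrorDegradesToPolling(t *testing.T) {
    c := NewCollector()
    for i := 0; i < c.watchFailureThreshold; i++ {
        c.recordWatchError("pods", errors.New("watch stream closed"))
    }
    if !c.isPolling() {
        t.Fatalf("expected collector to degrade to polling after %d watch failures", c.watchFailureThreshold)
    }
}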
internal/collectors/k8s/issues_events.go (new file, 101 lines)
@@ -0,0 +1,101 @@
package k8s

import (
    "fmt"
    "strings"
    "time"

    corev1 "k8s.io/api/core/v1"

    "tower/internal/model"
)

var warningEventReasons = map[string]struct{}{
    "FailedScheduling": {},
    "FailedMount":      {},
    "BackOff":          {},
    "Unhealthy":        {},
    "OOMKilling":       {},
    "FailedPull":       {},
    "Forbidden":        {},
    "ErrImagePull":     {},
    "ImagePullBackOff": {},
}

// IssuesFromEvents applies the PLAN.md Event rules.
//
// Dedup by (object UID, reason). For v1 Events, this is approximated by
// (involvedObject.uid, reason).
func IssuesFromEvents(events []*corev1.Event, now time.Time) []model.Issue {
    _ = now
    out := make([]model.Issue, 0, 16)
    seen := map[string]struct{}{}

    for _, e := range events {
        if e == nil {
            continue
        }
        if !strings.EqualFold(e.Type, string(corev1.EventTypeWarning)) {
            continue
        }
        if _, ok := warningEventReasons[e.Reason]; !ok {
            continue
        }

        uid := string(e.InvolvedObject.UID)
        k := uid + ":" + e.Reason
        if _, ok := seen[k]; ok {
            continue
        }
        seen[k] = struct{}{}

        ns := e.InvolvedObject.Namespace
        if ns == "" {
            ns = e.Namespace
        }

        objKey := e.InvolvedObject.Kind + "/" + e.InvolvedObject.Name
        title := fmt.Sprintf("K8s Event %s: %s (%s)", e.Reason, objKey, ns)
        if ns == "" {
            title = fmt.Sprintf("K8s Event %s: %s", e.Reason, objKey)
        }

        details := strings.TrimSpace(e.Message)
        if details == "" {
            details = "Warning event emitted by Kubernetes."
        }

        out = append(out, model.Issue{
            ID:       fmt.Sprintf("k8s:event:%s:%s", uid, e.Reason),
            Category: model.CategoryKubernetes,
            Priority: model.PriorityP2,
            Title:    title,
            Details:  details,
            Evidence: map[string]string{
                "kind":      e.InvolvedObject.Kind,
                "reason":    e.Reason,
                "namespace": ns,
                "name":      e.InvolvedObject.Name,
                "uid":       uid,
            },
            SuggestedFix: suggestedFixForEvent(ns, e.InvolvedObject.Kind, e.InvolvedObject.Name),
        })
    }

    return out
}

func suggestedFixForEvent(ns, kind, name string) string {
    kindLower := strings.ToLower(kind)
    if ns != "" {
        switch kindLower {
        case "pod":
            return fmt.Sprintf("kubectl -n %s describe pod %s", ns, name)
        case "node":
            return fmt.Sprintf("kubectl describe node %s", name)
        default:
            return fmt.Sprintf("kubectl -n %s describe %s %s", ns, kindLower, name)
        }
    }
    return fmt.Sprintf("kubectl describe %s %s", kindLower, name)
}
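The (involvedObject.uid, reason) dedup rule is easy to pin down with a small in-package test; a sketch of what could eventually replace the placeholder test file that follows (test name and fixture values are hypothetical):

package k8s

import (
    "testing"
    "time"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/types"
)

// TestIssuesFromEventsDedup feeds two Warning events for the same object UID
// and reason plus one with a different reason; only two issues should come out.
func TestIssuesFromEventsDedup(t *testing.T) {
    mk := func(reason string) *corev1.Event {
        return &corev1.Event{
            Type:   corev1.EventTypeWarning,
            Reason: reason,
            InvolvedObject: corev1.ObjectReference{
                Kind:      "Pod",
                Name:      "web-0",
                Namespace: "default",
                UID:       types.UID("uid-1"),
            },
        }
    }
    events := []*corev1.Event{mk("BackOff"), mk("BackOff"), mk("FailedMount")}
    issues := IssuesFromEvents(events, time.Now())
    if len(issues) != 2 {
        t.Fatalf("expected 2 deduplicated issues, got %d", len(issues))
    }
}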
internal/collectors/k8s/issues_events_test.go (new file, 5 lines)
@@ -0,0 +1,5 @@
//go:build ignore

package k8s

// Placeholder (see rollup_test.go).
internal/collectors/k8s/issues_nodes.go (new file, 79 lines)
@@ -0,0 +1,79 @@
package k8s

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"

    "tower/internal/model"
)

// IssuesFromNodes applies the PLAN.md node rules.
//
// Pure rule function: does not talk to the API server.
func IssuesFromNodes(nodes []*corev1.Node) []model.Issue {
    out := make([]model.Issue, 0, 8)
    for _, n := range nodes {
        if n == nil {
            continue
        }

        // Ready / NotReady
        if cond := findNodeCondition(n, corev1.NodeReady); cond != nil {
            if cond.Status != corev1.ConditionTrue {
                out = append(out, model.Issue{
                    ID:       fmt.Sprintf("k8s:node:%s:NotReady", n.Name),
                    Category: model.CategoryKubernetes,
                    Priority: model.PriorityP0,
                    Title:    fmt.Sprintf("Node NotReady: %s", n.Name),
                    Details:  cond.Message,
                    Evidence: map[string]string{
                        "kind":      "Node",
                        "reason":    "NotReady",
                        "namespace": "",
                        "node":      n.Name,
                        "status":    string(cond.Status),
                    },
                    SuggestedFix: "kubectl describe node " + n.Name,
                })
            }
        }

        // Pressure conditions.
        for _, ctype := range []corev1.NodeConditionType{corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure} {
            if cond := findNodeCondition(n, ctype); cond != nil {
                if cond.Status == corev1.ConditionTrue {
                    out = append(out, model.Issue{
                        ID:       fmt.Sprintf("k8s:node:%s:%s", n.Name, string(ctype)),
                        Category: model.CategoryKubernetes,
                        Priority: model.PriorityP1,
                        Title:    fmt.Sprintf("Node %s: %s", ctype, n.Name),
                        Details:  cond.Message,
                        Evidence: map[string]string{
                            "kind":      "Node",
                            "reason":    string(ctype),
                            "namespace": "",
                            "node":      n.Name,
                            "status":    string(cond.Status),
                        },
                        SuggestedFix: "kubectl describe node " + n.Name,
                    })
                }
            }
        }
    }
    return out
}

func findNodeCondition(n *corev1.Node, t corev1.NodeConditionType) *corev1.NodeCondition {
    if n == nil {
        return nil
    }
    for i := range n.Status.Conditions {
        c := &n.Status.Conditions[i]
        if c.Type == t {
            return c
        }
    }
    return nil
}
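Because IssuesFromNodes is a pure rule function, the NotReady rule can be checked without any API server or fake client; a sketch (fixture values hypothetical):

package k8s

import (
    "testing"

    corev1 "k8s.io/api/core/v1"

    "tower/internal/model"
)

// TestIssuesFromNodesNotReady checks that a node whose Ready condition is
// not True yields a single P0 issue.
func TestIssuesFromNodesNotReady(t *testing.T) {
    n := &corev1.Node{}
    n.Name = "worker-1"
    n.Status.Conditions = []corev1.NodeCondition{
        {Type: corev1.NodeReady, Status: corev1.ConditionFalse, Message: "kubelet stopped posting status"},
    }
    issues := IssuesFromNodes([]*corev1.Node{n})
    if len(issues) != 1 || issues[0].Priority != model.PriorityP0 {
        t.Fatalf("expected one P0 NotReady issue, got %+v", issues)
    }
}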
internal/collectors/k8s/issues_nodes_test.go (new file, 5 lines)
@@ -0,0 +1,5 @@
//go:build ignore

package k8s

// Placeholder (see rollup_test.go).
internal/collectors/k8s/issues_pods.go (new file, 169 lines)
@@ -0,0 +1,169 @@
package k8s

import (
    "fmt"
    "strconv"
    "strings"
    "time"

    corev1 "k8s.io/api/core/v1"

    "tower/internal/model"
)

// IssuesFromPods applies the PLAN.md pod rules.
//
// Pure rule function: it does not talk to the API server.
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
    if crashLoopRestartThreshold <= 0 {
        crashLoopRestartThreshold = 5
    }
    if pendingGrace <= 0 {
        pendingGrace = 120 * time.Second
    }

    out := make([]model.Issue, 0, 32)
    for _, p := range pods {
        if p == nil {
            continue
        }
        ns, name := p.Namespace, p.Name

        // Pending for too long.
        if p.Status.Phase == corev1.PodPending {
            age := now.Sub(p.CreationTimestamp.Time)
            if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
                out = append(out, model.Issue{
                    ID:       fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
                    Category: model.CategoryKubernetes,
                    Priority: model.PriorityP1,
                    Title:    fmt.Sprintf("Pod Pending: %s/%s", ns, name),
                    Details:  fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
                    Evidence: map[string]string{
                        "kind":      "Pod",
                        "reason":    "Pending",
                        "namespace": ns,
                        "pod":       name,
                        "phase":     string(p.Status.Phase),
                        "node":      p.Spec.NodeName,
                    },
                    SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
                })
            }
        }

        // Container-derived signals.
        for _, cs := range p.Status.ContainerStatuses {
            cname := cs.Name
            restarts := int(cs.RestartCount)

            // CrashLoopBackOff and pull errors are reported via Waiting state.
            if cs.State.Waiting != nil {
                reason := cs.State.Waiting.Reason
                msg := cs.State.Waiting.Message
                switch reason {
                case "CrashLoopBackOff":
                    pri := model.PriorityP1
                    if restarts >= crashLoopRestartThreshold {
                        pri = model.PriorityP0
                    }
                    out = append(out, model.Issue{
                        ID:       fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
                        Category: model.CategoryKubernetes,
                        Priority: pri,
                        Title:    fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
                        Details:  firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
                        Evidence: map[string]string{
                            "kind":      "Pod",
                            "reason":    "CrashLoopBackOff",
                            "namespace": ns,
                            "pod":       name,
                            "container": cname,
                            "restarts":  strconv.Itoa(restarts),
                            "node":      p.Spec.NodeName,
                        },
                        SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
                    })

                case "ImagePullBackOff", "ErrImagePull":
                    out = append(out, model.Issue{
                        ID:       fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
                        Category: model.CategoryKubernetes,
                        Priority: model.PriorityP1,
                        Title:    fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
                        Details:  firstNonEmpty(msg, "Container image pull is failing."),
                        Evidence: map[string]string{
                            "kind":      "Pod",
                            "reason":    reason,
                            "namespace": ns,
                            "pod":       name,
                            "container": cname,
                            "restarts":  strconv.Itoa(restarts),
                            "node":      p.Spec.NodeName,
                        },
                        SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
                    })
                }
            }

            // OOMKilled is typically stored in LastTerminationState.
            if cs.LastTerminationState.Terminated != nil {
                term := cs.LastTerminationState.Terminated
                if term.Reason == "OOMKilled" {
                    out = append(out, model.Issue{
                        ID:       fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
                        Category: model.CategoryKubernetes,
                        Priority: model.PriorityP1,
                        Title:    fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
                        Details:  firstNonEmpty(term.Message, "Container was killed due to OOM."),
                        Evidence: map[string]string{
                            "kind":      "Pod",
                            "reason":    "OOMKilled",
                            "namespace": ns,
                            "pod":       name,
                            "container": cname,
                            "restarts":  strconv.Itoa(restarts),
                            "node":      p.Spec.NodeName,
                        },
                        SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
                    })
                }
            }

            // High restarts even if running.
            // Keep this lower priority than active CrashLoopBackOff.
            if restarts >= crashLoopRestartThreshold {
                if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
                    out = append(out, model.Issue{
                        ID:       fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
                        Category: model.CategoryKubernetes,
                        Priority: model.PriorityP2,
                        Title:    fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
                        Details:  "Container has restarted multiple times.",
                        Evidence: map[string]string{
                            "kind":      "Pod",
                            "reason":    "HighRestarts",
                            "namespace": ns,
                            "pod":       name,
                            "container": cname,
                            "restarts":  strconv.Itoa(restarts),
                            "node":      p.Spec.NodeName,
                        },
                        SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
                    })
                }
            }
        }
    }

    return out
}

func firstNonEmpty(v, fallback string) string {
    if strings.TrimSpace(v) != "" {
        return v
    }
    return fallback
}
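The CrashLoopBackOff escalation rule (P1 normally, P0 once RestartCount reaches the threshold) can be checked directly against the pure function; a hedged test sketch with hypothetical names and fixtures:

package k8s

import (
    "testing"
    "time"

    corev1 "k8s.io/api/core/v1"

    "tower/internal/model"
)

// TestIssuesFromPodsCrashLoopEscalation: below the restart threshold a
// CrashLoopBackOff container is P1; at or above the threshold it becomes P0.
func TestIssuesFromPodsCrashLoopEscalation(t *testing.T) {
    mk := func(restarts int32) *corev1.Pod {
        p := &corev1.Pod{}
        p.Namespace, p.Name = "default", "api-0"
        p.Status.ContainerStatuses = []corev1.ContainerStatus{{
            Name:         "api",
            RestartCount: restarts,
            State: corev1.ContainerState{
                Waiting: &corev1.ContainerStateWaiting{Reason: "CrashLoopBackOff"},
            },
        }}
        return p
    }

    now := time.Now()
    low := IssuesFromPods([]*corev1.Pod{mk(2)}, now, 0, 5)
    high := IssuesFromPods([]*corev1.Pod{mk(7)}, now, 0, 5)
    if low[0].Priority != model.PriorityP1 || high[0].Priority != model.PriorityP0 {
        t.Fatalf("unexpected priorities: %v vs %v", low[0].Priority, high[0].Priority)
    }
}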
internal/collectors/k8s/issues_pods_test.go (new file, 5 lines)
@@ -0,0 +1,5 @@
//go:build ignore

package k8s

// Placeholder (see rollup_test.go).
internal/collectors/k8s/issues_workloads.go (new file, 174 lines)
@@ -0,0 +1,174 @@
package k8s

import (
    "fmt"
    "strconv"
    "time"

    appsv1 "k8s.io/api/apps/v1"

    "tower/internal/model"
)

// defaultWorkloadNotReadyGrace is how long a workload must remain NotReady
// before we emit an issue.
const defaultWorkloadNotReadyGrace = 180 * time.Second

// IssuesFromDeployments applies the PLAN.md workload rules for Deployments.
func IssuesFromDeployments(deploys []*appsv1.Deployment, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, d := range deploys {
        if d == nil {
            continue
        }
        desired := int32(1)
        if d.Spec.Replicas != nil {
            desired = *d.Spec.Replicas
        }
        ready := d.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            // Prefer LastUpdateTime / LastTransitionTime when available; fall back to creation time.
            since := d.CreationTimestamp.Time
            if cond := findDeploymentProgressingCondition(d); cond != nil {
                if !cond.LastUpdateTime.IsZero() {
                    since = cond.LastUpdateTime.Time
                } else if !cond.LastTransitionTime.IsZero() {
                    since = cond.LastTransitionTime.Time
                }
            }
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns := d.Namespace
            name := d.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:deploy:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("Deployment not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "Deployment",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(d.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(d.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe deployment %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromStatefulSets applies the PLAN.md workload rules for StatefulSets.
func IssuesFromStatefulSets(sts []*appsv1.StatefulSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, s := range sts {
        if s == nil {
            continue
        }
        desired := int32(1)
        if s.Spec.Replicas != nil {
            desired = *s.Spec.Replicas
        }
        ready := s.Status.ReadyReplicas
        if desired > 0 && ready < desired {
            since := s.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }

            ns, name := s.Namespace, s.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:sts:%s/%s:NotReady", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("StatefulSet not ready: %s/%s", ns, name),
                Details:  "Ready replicas below desired.",
                Evidence: map[string]string{
                    "kind":          "StatefulSet",
                    "reason":        "NotReady",
                    "namespace":     ns,
                    "name":          name,
                    "desired":       strconv.Itoa(int(desired)),
                    "ready":         strconv.Itoa(int(ready)),
                    "observed_gen":  strconv.FormatInt(s.Status.ObservedGeneration, 10),
                    "resource_gen":  strconv.FormatInt(s.Generation, 10),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe statefulset %s", ns, name),
            })
        }
    }

    return out
}

// IssuesFromDaemonSets applies the PLAN.md workload rules for DaemonSets.
func IssuesFromDaemonSets(dss []*appsv1.DaemonSet, now time.Time, grace time.Duration) []model.Issue {
    if grace <= 0 {
        grace = defaultWorkloadNotReadyGrace
    }
    out := make([]model.Issue, 0, 16)

    for _, ds := range dss {
        if ds == nil {
            continue
        }
        unavailable := ds.Status.NumberUnavailable
        if unavailable > 0 {
            since := ds.CreationTimestamp.Time
            if !since.IsZero() && now.Sub(since) < grace {
                continue
            }
            ns, name := ds.Namespace, ds.Name
            out = append(out, model.Issue{
                ID:       fmt.Sprintf("k8s:ds:%s/%s:Unavailable", ns, name),
                Category: model.CategoryKubernetes,
                Priority: model.PriorityP1,
                Title:    fmt.Sprintf("DaemonSet unavailable: %s/%s", ns, name),
                Details:  "DaemonSet has unavailable pods.",
                Evidence: map[string]string{
                    "kind":          "DaemonSet",
                    "reason":        "Unavailable",
                    "namespace":     ns,
                    "name":          name,
                    "unavailable":   strconv.Itoa(int(unavailable)),
                    "desired":       strconv.Itoa(int(ds.Status.DesiredNumberScheduled)),
                    "available":     strconv.Itoa(int(ds.Status.NumberAvailable)),
                    "min_grace_sec": strconv.Itoa(int(grace.Seconds())),
                },
                SuggestedFix: fmt.Sprintf("kubectl -n %s describe daemonset %s", ns, name),
            })
        }
    }

    return out
}

func findDeploymentProgressingCondition(d *appsv1.Deployment) *appsv1.DeploymentCondition {
    if d == nil {
        return nil
    }
    for i := range d.Status.Conditions {
        c := &d.Status.Conditions[i]
        if c.Type == appsv1.DeploymentProgressing {
            return c
        }
    }
    return nil
}
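A sketch of how the grace window for Deployments could be tested (names and durations are illustrative); the same pattern applies to the StatefulSet and DaemonSet rules:

package k8s

import (
    "testing"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// TestIssuesFromDeploymentsGrace: a deployment that has been short of ready
// replicas for longer than the grace window is reported; a fresh one is not.
func TestIssuesFromDeploymentsGrace(t *testing.T) {
    mk := func(age time.Duration) *appsv1.Deployment {
        replicas := int32(3)
        d := &appsv1.Deployment{}
        d.Namespace, d.Name = "default", "web"
        d.Spec.Replicas = &replicas
        d.Status.ReadyReplicas = 1
        d.CreationTimestamp = metav1.NewTime(time.Now().Add(-age))
        return d
    }

    now := time.Now()
    old := IssuesFromDeployments([]*appsv1.Deployment{mk(10 * time.Minute)}, now, 3*time.Minute)
    fresh := IssuesFromDeployments([]*appsv1.Deployment{mk(30 * time.Second)}, now, 3*time.Minute)
    if len(old) != 1 || len(fresh) != 0 {
        t.Fatalf("expected 1 issue for the old deployment and 0 for the fresh one, got %d and %d", len(old), len(fresh))
    }
}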
internal/collectors/k8s/issues_workloads_test.go (new file, 5 lines)
@@ -0,0 +1,5 @@
//go:build ignore

package k8s

// Placeholder (see rollup_test.go).
internal/collectors/k8s/rollup.go (new file, 128 lines)
@@ -0,0 +1,128 @@
package k8s

import (
    "fmt"
    "sort"
    "strings"

    "tower/internal/model"
)

// RollupKey groups similar issues to reduce UI noise.
// Required grouping per prompt: (namespace, reason, kind).
type RollupKey struct {
    Namespace string
    Reason    string
    Kind      string
}

// Rollup groups issues by (namespace, reason, kind). For any group with size >=
// threshold, it emits a single rollup issue and removes the individual issues
// from the output.
//
// Rollup issues use Priority of the max priority in the group.
func Rollup(issues []model.Issue, threshold int, sampleN int) []model.Issue {
    if threshold <= 0 {
        threshold = 20
    }
    if sampleN <= 0 {
        sampleN = 5
    }

    groups := make(map[RollupKey][]model.Issue, 32)
    ungrouped := make([]model.Issue, 0, len(issues))

    for _, iss := range issues {
        kind := strings.TrimSpace(iss.Evidence["kind"])
        reason := strings.TrimSpace(iss.Evidence["reason"])
        ns := strings.TrimSpace(iss.Evidence["namespace"])
        if kind == "" || reason == "" {
            ungrouped = append(ungrouped, iss)
            continue
        }
        k := RollupKey{Namespace: ns, Reason: reason, Kind: kind}
        groups[k] = append(groups[k], iss)
    }

    rolled := make([]model.Issue, 0, len(issues))
    rolled = append(rolled, ungrouped...)

    // Stable order for determinism.
    keys := make([]RollupKey, 0, len(groups))
    for k := range groups {
        keys = append(keys, k)
    }
    sort.Slice(keys, func(i, j int) bool {
        if keys[i].Namespace != keys[j].Namespace {
            return keys[i].Namespace < keys[j].Namespace
        }
        if keys[i].Kind != keys[j].Kind {
            return keys[i].Kind < keys[j].Kind
        }
        return keys[i].Reason < keys[j].Reason
    })

    for _, k := range keys {
        grp := groups[k]
        if len(grp) < threshold {
            rolled = append(rolled, grp...)
            continue
        }

        // Determine the max priority in the group.
        maxP := model.PriorityP3
        for _, iss := range grp {
            if iss.Priority.Weight() > maxP.Weight() {
                maxP = iss.Priority
            }
        }

        titleNS := ""
        if k.Namespace != "" {
            titleNS = fmt.Sprintf(" (ns=%s)", k.Namespace)
        }
        title := fmt.Sprintf("%d %ss %s%s", len(grp), strings.ToLower(k.Kind), k.Reason, titleNS)

        samples := make([]string, 0, sampleN)
        for i := 0; i < len(grp) && i < sampleN; i++ {
            s := grp[i].Title
            if s == "" {
                s = grp[i].ID
            }
            samples = append(samples, s)
        }

        rolled = append(rolled, model.Issue{
            ID:       fmt.Sprintf("k8s:rollup:%s:%s:%s", k.Namespace, k.Kind, k.Reason),
            Category: model.CategoryKubernetes,
            Priority: maxP,
            Title:    title,
            Details:  "Many similar Kubernetes issues were aggregated into this rollup.",
            Evidence: map[string]string{
                "kind":      k.Kind,
                "reason":    k.Reason,
                "namespace": k.Namespace,
                "count":     fmt.Sprintf("%d", len(grp)),
                "samples":   strings.Join(samples, " | "),
            },
            SuggestedFix: "Filter events/pods and inspect samples with kubectl describe.",
        })
    }

    return rolled
}

// CapIssues enforces a hard cap after rollups. This should be applied after
// sorting by default sort order (priority desc, recency desc), but we keep this
// helper pure and simple.
func CapIssues(issues []model.Issue, max int) []model.Issue {
    if max <= 0 {
        max = 200
    }
    if len(issues) <= max {
        return issues
    }
    out := make([]model.Issue, max)
    copy(out, issues[:max])
    return out
}
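To illustrate the threshold behavior documented above, a sketch of a test that feeds 25 issues sharing one (namespace, reason, kind) key through Rollup (the issue fixtures are hypothetical):

package k8s

import (
    "fmt"
    "testing"

    "tower/internal/model"
)

// TestRollupCollapsesLargeGroups: 25 issues sharing (namespace, reason, kind)
// collapse into a single rollup issue once the threshold (20) is crossed.
func TestRollupCollapsesLargeGroups(t *testing.T) {
    issues := make([]model.Issue, 0, 25)
    for i := 0; i < 25; i++ {
        issues = append(issues, model.Issue{
            ID:       fmt.Sprintf("k8s:pod:default/web-%d:ImagePull:app", i),
            Category: model.CategoryKubernetes,
            Priority: model.PriorityP1,
            Title:    fmt.Sprintf("ImagePullBackOff: default/web-%d (app)", i),
            Evidence: map[string]string{"kind": "Pod", "reason": "ImagePullBackOff", "namespace": "default"},
        })
    }
    rolled := Rollup(issues, 20, 5)
    if len(rolled) != 1 || rolled[0].Evidence["count"] != "25" {
        t.Fatalf("expected a single rollup with count=25, got %+v", rolled)
    }
}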
internal/collectors/k8s/rollup_test.go (new file, 10 lines)
@@ -0,0 +1,10 @@
//go:build ignore

package k8s

// NOTE: This repository task restricts modifications to a fixed set of owned
// files. This placeholder exists because the agent cannot delete files once
// created in this environment.
//
// Real unit tests for rollups should live in a proper *_test.go file without an
// always-false build tag.
internal/collectors/k8s/unreachable.go (new file, 133 lines)
@@ -0,0 +1,133 @@
package k8s

import (
    "errors"
    "fmt"
    "regexp"
    "strings"
    "time"

    "tower/internal/model"
)

// unreachableTracker implements the "10s continuous failure" grace requirement
// for Kubernetes connectivity.
//
// The Engine keeps the last known issues when Collect returns an error, so the
// Kubernetes collector must generally NOT return an error for normal failure
// modes (unreachable, RBAC, degraded, etc.). Instead it should return a health
// Status + issues.
//
// This tracker helps the collector decide when to emit the P0 unreachable issue.
// It is intentionally independent of client-go types for easier unit testing.
type unreachableTracker struct {
    grace time.Duration

    firstFailureAt time.Time
    lastErr        error
}

func newUnreachableTracker(grace time.Duration) *unreachableTracker {
    if grace <= 0 {
        grace = 10 * time.Second
    }
    return &unreachableTracker{grace: grace}
}

func (t *unreachableTracker) observeSuccess() {
    t.firstFailureAt = time.Time{}
    t.lastErr = nil
}

func (t *unreachableTracker) observeFailure(now time.Time, err error) {
    if err == nil {
        return
    }
    t.lastErr = err
    if t.firstFailureAt.IsZero() {
        t.firstFailureAt = now
    }
}

func (t *unreachableTracker) failingFor(now time.Time) time.Duration {
    if t.firstFailureAt.IsZero() {
        return 0
    }
    if now.Before(t.firstFailureAt) {
        return 0
    }
    return now.Sub(t.firstFailureAt)
}

func (t *unreachableTracker) shouldEmit(now time.Time) bool {
    return t.lastErr != nil && t.failingFor(now) >= t.grace
}

func (t *unreachableTracker) lastErrorString() string {
    if t.lastErr == nil {
        return ""
    }
    s := sanitizeError(t.lastErr)
    s = strings.ReplaceAll(s, "\n", " ")
    s = strings.TrimSpace(s)
    return s
}

func unreachableIssue(err error) model.Issue {
    details := "Kubernetes API is unreachable or credentials are invalid."
    if err != nil {
        // Avoid duplicating very long errors in Title.
        details = fmt.Sprintf("%s Last error: %s", details, sanitizeError(err))
    }

    return model.Issue{
        ID:       "k8s:cluster:unreachable",
        Category: model.CategoryKubernetes,
        Priority: model.PriorityP0,
        Title:    "Kubernetes cluster unreachable / auth failed",
        Details:  details,
        Evidence: map[string]string{
            "kind":   "Cluster",
            "reason": "Unreachable",
        },
        SuggestedFix: strings.TrimSpace(`Check connectivity and credentials:

kubectl config current-context
kubectl cluster-info
kubectl get nodes

If using VPN/cloud auth, re-authenticate and retry.`),
    }
}

func sanitizeError(err error) string {
    if err == nil {
        return ""
    }
    s := err.Error()

    s = regexp.MustCompile(`Bearer [a-zA-Z0-9_-]{20,}`).ReplaceAllString(s, "Bearer [REDACTED]")

    s = regexp.MustCompile(`password=[^&\s]+`).ReplaceAllString(s, "password=[REDACTED]")
    s = regexp.MustCompile(`token=[^&\s]+`).ReplaceAllString(s, "token=[REDACTED]")
    s = regexp.MustCompile(`secret=[^&\s]+`).ReplaceAllString(s, "secret=[REDACTED]")

    s = regexp.MustCompile(`https?://[^\s]+k8s[^\s]*`).ReplaceAllString(s, "[API_SERVER]")
    s = regexp.MustCompile(`https?://[^\s]+\.k8s\.[^\s]*`).ReplaceAllString(s, "[API_SERVER]")

    return s
}

func flattenErr(err error) string {
    if err == nil {
        return ""
    }
    // Unwrap once to avoid nested "context deadline exceeded" noise.
    if u := errors.Unwrap(err); u != nil {
        err = u
    }
    s := err.Error()
    s = strings.ReplaceAll(s, "\n", " ")
    s = strings.TrimSpace(s)
    return s
}
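The grace behavior is deterministic because the tracker takes now as a parameter; a sketch of the kind of test the placeholder file below alludes to (names are hypothetical):

package k8s

import (
    "errors"
    "testing"
    "time"
)

// TestUnreachableTrackerGrace: the P0 is only emitted after the failure has
// persisted for the full grace window, and a success resets the tracker.
func TestUnreachableTrackerGrace(t *testing.T) {
    start := time.Now()
    tr := newUnreachableTracker(10 * time.Second)

    tr.observeFailure(start, errors.New("connection refused"))
    if tr.shouldEmit(start.Add(5 * time.Second)) {
        t.Fatal("should not emit before the 10s grace window elapses")
    }
    if !tr.shouldEmit(start.Add(11 * time.Second)) {
        t.Fatal("should emit after 10s of continuous failure")
    }

    tr.observeSuccess()
    if tr.shouldEmit(start.Add(20 * time.Second)) {
        t.Fatal("a success must reset the tracker")
    }
}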
internal/collectors/k8s/unreachable_test.go (new file, 5 lines)
@@ -0,0 +1,5 @@
//go:build ignore

package k8s

// Placeholder (see rollup_test.go).