feat: implement ControlTower TUI for cluster and host monitoring

Add a complete TUI application for monitoring Kubernetes clusters and host systems. Features include:

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code, with tests for the host collectors, engine, store, model, and export packages.
2025-12-24 13:03:08 -08:00
parent c2c03fd664
commit 1421b4659e
40 changed files with 5941 additions and 0 deletions
--- a/internal/engine/engine.go
+++ b/internal/engine/engine.go
@@ -0,0 +1,309 @@
+package engine
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"tower/internal/collectors"
+	"tower/internal/model"
+)
+
+// IssueStore is the subset of the issue store that the Engine depends on.
+//
+// The concrete implementation lives in internal/store, and
+// internal/store.IssueStore must satisfy this interface. The Engine depends on
+// an interface rather than the concrete type to stay testable.
+//
+// The store is responsible for dedupe and lifecycle (resolve-after, ack, etc.);
+// the Engine simply merges collector outputs and passes them in. Do not add
+// persistence here.
+type IssueStore interface {
+	// Upsert is called after every collector run with the merged latest issues
+	// of all collectors; now is that run's finish time.
+	Upsert(now time.Time, issues []model.Issue)
+	// Snapshot returns the issues to publish; the Engine passes the returned
+	// slice to model.SortIssuesDefault before publishing it.
+	Snapshot(now time.Time) []model.Issue
+}
+
+// CollectorConfig wires a collector into the Engine.
+//
+// Timeout bounds each individual Collect() invocation; if Timeout <= 0, no
+// per-collector timeout is applied. The run interval is not configured here:
+// it comes from the collector's own Interval() method.
+type CollectorConfig struct {
+	Collector collectors.Collector
+	Timeout   time.Duration
+}
+
+// CollectorHealth tracks the current health of a collector.
+//
+// The Engine's aggregator updates it each time the collector reports a result,
+// and a copy of every collector's health is included in each Snapshot.
+type CollectorHealth struct {
+	Status     collectors.Status // status returned by the most recent run
+	LastError  error             // error returned by the most recent run; nil if it succeeded
+	LastRun    time.Time         // finish time of the most recent run
+	LastOK     time.Time         // finish time of the most recent run that returned no error
+	LastRunDur time.Duration     // wall-clock duration of the most recent run
+}
+
+// Snapshot is the Engine's UI-facing view at a point in time.
+//
+// Issues are sorted using the default sort order (Priority desc, then recency
+// desc). Collectors maps collector name to a copy of that collector's health.
+type Snapshot struct {
+	At         time.Time // timestamp that was passed to the store's Snapshot for this view
+	Issues     []model.Issue
+	Collectors map[string]CollectorHealth
+}
+
+type collectResult struct { // outcome of one Collect() call, sent from runCollector to runAggregator
+	name     string            // collector name
+	at       time.Time         // time Collect() returned
+	duration time.Duration     // how long Collect() took
+	issues   []model.Issue     // private copy of the issues Collect() returned
+	status   collectors.Status // status Collect() returned
+	err      error             // error Collect() returned, if any
+}
+
+type collectorRunner struct { // one configured collector plus its refresh signal
+	cfg       CollectorConfig
+	refreshCh chan struct{} // capacity 1; RefreshNow sends here to request an immediate collect
+}
+
+// Engine runs collectors on their own schedules, merges issues, and updates the store.
+// It publishes snapshots for the UI on the channel returned by Snapshots().
+//
+// Lifecycle:
+//
+//	e := New(...)
+//	e.Start(ctx)
+//	defer e.Stop()
+//
+// Snapshots are emitted (without blocking; one is dropped if the buffer is full):
+//   - after any store update (collector completion)
+//   - periodically at refreshInterval (if > 0)
+//
+// RefreshNow() forces all collectors to run immediately.
+type Engine struct {
+	store           IssueStore
+	refreshInterval time.Duration // period of timer-driven snapshots; <= 0 disables them
+
+	snapshots chan Snapshot      // aggregator -> UI; closed by Stop
+	results   chan collectResult // collector goroutines -> aggregator
+
+	mu                      sync.Mutex               // guards the two maps below
+	latestIssuesByCollector map[string][]model.Issue // issues from each collector's last error-free run
+	health                  map[string]CollectorHealth
+
+	collectors []collectorRunner
+
+	cancel context.CancelFunc // set by Start; cancels the context shared by all Engine goroutines
+	wg     sync.WaitGroup     // counts the aggregator and collector goroutines
+
+	startOnce sync.Once
+	stopOnce  sync.Once
+}
+
+// New builds an Engine around the store st and the collectors in cs.
+//
+// refreshInterval is the period of timer-driven snapshot emission; when it is
+// <= 0, snapshots are only emitted when collectors finish. Nothing runs until
+// Start is called.
+func New(st IssueStore, cs []CollectorConfig, refreshInterval time.Duration) *Engine {
+	e := &Engine{
+		store:                   st,
+		refreshInterval:         refreshInterval,
+		snapshots:               make(chan Snapshot, 32),
+		results:                 make(chan collectResult, 64),
+		latestIssuesByCollector: make(map[string][]model.Issue),
+		health:                  make(map[string]CollectorHealth),
+		collectors:              make([]collectorRunner, len(cs)),
+	}
+
+	for i, cfg := range cs {
+		// Capacity 1 lets RefreshNow queue a single pending refresh without blocking.
+		e.collectors[i] = collectorRunner{cfg: cfg, refreshCh: make(chan struct{}, 1)}
+	}
+
+	return e
+}
+
+// Start launches the aggregator goroutine and one goroutine per collector, all
+// bound to a context derived from parent. Only the first call has any effect.
+func (e *Engine) Start(parent context.Context) {
+	e.startOnce.Do(func() {
+		ctx, cancel := context.WithCancel(parent)
+		e.cancel = cancel
+
+		// One slot for the aggregator plus one per collector.
+		e.wg.Add(1 + len(e.collectors))
+		go func() {
+			defer e.wg.Done()
+			e.runAggregator(ctx)
+		}()
+
+		for i := range e.collectors {
+			go func(runner *collectorRunner) {
+				defer e.wg.Done()
+				e.runCollector(ctx, runner)
+			}(&e.collectors[i])
+		}
+	})
+}
+
+// Stop cancels collection, waits for the goroutines, then closes the snapshots channel.
+func (e *Engine) Stop() {
+	e.stopOnce.Do(func() {
+		defer close(e.snapshots) // runs last, once the goroutines have been waited for
+		if cancel := e.cancel; cancel != nil {
+			cancel()
+		}
+		e.wg.Wait()
+	})
+}
+
+// Snapshots returns the receive-only channel of snapshots; Stop closes it.
+func (e *Engine) Snapshots() <-chan Snapshot { return e.snapshots }
+
+// RefreshNow asks every collector to run again without waiting for its next tick.
+//
+// It never blocks: each collector holds at most one pending refresh signal,
+// and further signals are dropped while one is already queued.
+func (e *Engine) RefreshNow() {
+	for _, runner := range e.collectors {
+		select {
+		case runner.refreshCh <- struct{}{}:
+		default:
+			// A refresh is already pending for this collector.
+		}
+	}
+}
+
+// runCollector drives a single collector until ctx is cancelled: it collects
+// once immediately, then on every tick of the collector's interval (1s if the
+// collector reports a non-positive interval) and whenever RefreshNow signals
+// r.refreshCh. Each outcome is sent to the aggregator on e.results.
+func (e *Engine) runCollector(ctx context.Context, r *collectorRunner) {
+	name := r.cfg.Collector.Name()
+	interval := r.cfg.Collector.Interval()
+	if interval <= 0 {
+		interval = time.Second
+	}
+
+	doCollect := func() {
+		// select picks at random among ready cases, so a tick or refresh can win
+		// over ctx.Done() during shutdown; don't start new work in that case.
+		if ctx.Err() != nil {
+			return
+		}
+		start := time.Now()
+		collectCtx := ctx
+		cancel := func() {}
+		if r.cfg.Timeout > 0 {
+			collectCtx, cancel = context.WithTimeout(ctx, r.cfg.Timeout)
+		}
+		defer cancel()
+
+		issues, st, err := r.cfg.Collector.Collect(collectCtx)
+		finish := time.Now()
+
+		// Copy issues slice to avoid data races when collectors reuse underlying storage.
+		copied := make([]model.Issue, len(issues))
+		copy(copied, issues)
+
+		res := collectResult{
+			name: name, at: finish, duration: finish.Sub(start),
+			issues: copied, status: st, err: err,
+		}
+		select {
+		case e.results <- res:
+		case <-ctx.Done(): // shutting down; give up on delivering this result
+		}
+	}
+
+	// Collect immediately on start so the UI isn't empty for the first interval.
+	doCollect()
+
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			doCollect()
+		case <-r.refreshCh:
+			doCollect()
+		}
+	}
+}
+
+// runAggregator is the consumer of e.results. For every collector result it
+// updates that collector's health and cached issues, upserts the merged
+// issues of all collectors into the store, and emits a snapshot. It also
+// emits a snapshot on every refreshInterval tick (when > 0), and returns once
+// ctx is cancelled.
+func (e *Engine) runAggregator(ctx context.Context) {
+	var tick <-chan time.Time // stays nil, hence never ready, when periodic refresh is off
+	if e.refreshInterval > 0 {
+		t := time.NewTicker(e.refreshInterval)
+		defer t.Stop()
+		tick = t.C
+	}
+
+	publish := func(at time.Time) {
+		issues := e.store.Snapshot(at)
+		model.SortIssuesDefault(issues) // deterministic default order for the UI
+
+		// Hand the UI its own copy of the health map.
+		e.mu.Lock()
+		healthCopy := make(map[string]CollectorHealth, len(e.health))
+		for name, h := range e.health {
+			healthCopy[name] = h
+		}
+		e.mu.Unlock()
+
+		// Non-blocking send: the snapshot is dropped if the UI is behind.
+		select {
+		case e.snapshots <- Snapshot{At: at, Issues: issues, Collectors: healthCopy}:
+		default:
+		}
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+
+		case <-tick:
+			publish(time.Now())
+
+		case res := <-e.results:
+			e.mu.Lock()
+			hl := e.health[res.name]
+			hl.Status = res.status
+			hl.LastRun = res.at
+			hl.LastRunDur = res.duration
+			hl.LastError = res.err
+			if res.err == nil {
+				hl.LastOK = res.at
+				// Only a successful run replaces the cached issues; after an error
+				// the last known issues are kept so that transient errors/timeouts
+				// do not make issues disappear.
+				e.latestIssuesByCollector[res.name] = res.issues
+			}
+			e.health[res.name] = hl
+
+			merged := make([]model.Issue, 0, 64)
+			for _, cached := range e.latestIssuesByCollector {
+				merged = append(merged, cached...)
+			}
+			e.mu.Unlock()
+
+			e.store.Upsert(res.at, merged)
+			publish(res.at)
+		}
+	}
+}
--- a/internal/engine/engine_test.go
+++ b/internal/engine/engine_test.go
@@ -0,0 +1,225 @@
+package engine
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"tower/internal/collectors"
+	"tower/internal/model"
+)
+
+type fakeStore struct { // in-memory IssueStore used by the engine tests
+	mu sync.Mutex // guards every field below
+
+	upsertCalls int           // number of Upsert calls so far
+	lastNow     time.Time     // now passed to the latest Upsert
+	lastIssues  []model.Issue // copy of the issues passed to the latest Upsert
+}
+
+// Upsert records the call and replaces the stored issues with a copy of issues.
+func (s *fakeStore) Upsert(now time.Time, issues []model.Issue) {
+	// Copy before taking the lock; a shallow slice copy is enough for our tests.
+	issuesCopy := append([]model.Issue(nil), issues...)
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.upsertCalls++
+	s.lastNow, s.lastIssues = now, issuesCopy
+}
+
+func (s *fakeStore) Snapshot(now time.Time) []model.Issue { // now is unused by this fake
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return append([]model.Issue(nil), s.lastIssues...) // fresh copy of the last upserted issues
+}
+
+func (s *fakeStore) UpsertCount() int { // number of Upsert calls so far
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.upsertCalls
+}
+
+type fakeCollector struct { // scriptable collector used by the engine tests
+	name     string        // returned by Name()
+	interval time.Duration // returned by Interval()
+
+	// delay simulates work. If ctx is canceled/timeout hits, Collect returns ctx.Err().
+	delay time.Duration
+
+	issuesFn func(call int64) []model.Issue // optional; receives the 1-based Collect call number
+
+	calls  atomic.Int64   // number of Collect calls so far
+	callCh chan time.Time // optional; gets each Collect's start time via a non-blocking send
+}
+
+func (c *fakeCollector) Name() string { return c.name } // the Engine keys health entries by this name
+// Interval returns the configured collection interval; the Engine treats a
+// non-positive value as one second.
+func (c *fakeCollector) Interval() time.Duration { return c.interval }
+
+// Collect counts the call, reports it on callCh without blocking, waits out delay
+// (returning ctx.Err() if ctx ends first), then returns issuesFn's issues, if any.
+func (c *fakeCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
+	var st collectors.Status // always the zero Status
+	call := c.calls.Add(1)
+	if c.callCh != nil {
+		select {
+		case c.callCh <- time.Now():
+		default:
+		}
+	}
+
+	if c.delay > 0 {
+		timer := time.NewTimer(c.delay)
+		defer timer.Stop()
+		select {
+		case <-timer.C:
+		case <-ctx.Done():
+			return nil, st, ctx.Err()
+		}
+	}
+	if c.issuesFn == nil {
+		return nil, st, nil
+	}
+	return c.issuesFn(call), st, nil
+}
+
+func recvSnapshot(t *testing.T, ch <-chan Snapshot, within time.Duration) Snapshot { // waits for one snapshot
+	t.Helper() // report failures at the caller's line
+	select {
+	case s := <-ch:
+		return s
+	case <-time.After(within): // nothing arrived within the allowed time
+		t.Fatalf("timed out waiting for snapshot")
+		return Snapshot{} // not reached: Fatalf stops the test, but the compiler needs a return
+	}
+}
+
+// TestEngine_UpsertAndSnapshotsEmitted checks that a collector's first run is
+// upserted into the store and shows up, with its health entry, in a snapshot.
+func TestEngine_UpsertAndSnapshotsEmitted(t *testing.T) {
+	store := &fakeStore{}
+	oneIssue := func(call int64) []model.Issue {
+		return []model.Issue{{
+			ID:       "id-1",
+			Priority: model.PriorityP1,
+			Title:    "hello",
+			LastSeen: time.Now(),
+		}}
+	}
+	col := &fakeCollector{name: "c1", interval: 100 * time.Millisecond, issuesFn: oneIssue}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	eng := New(store, []CollectorConfig{{Collector: col, Timeout: 200 * time.Millisecond}}, 0)
+	defer eng.Stop()
+
+	eng.Start(ctx)
+
+	// refreshInterval is 0, so this snapshot can only come from a collector run.
+	snap := recvSnapshot(t, eng.Snapshots(), 300*time.Millisecond)
+	if store.UpsertCount() < 1 {
+		t.Fatalf("expected store.Upsert to be called")
+	}
+	if len(snap.Issues) != 1 || snap.Issues[0].ID != "id-1" {
+		t.Fatalf("expected snapshot to contain issue id-1; got %+v", snap.Issues)
+	}
+	if _, ok := snap.Collectors["c1"]; !ok {
+		t.Fatalf("expected collector health entry for c1")
+	}
+}
+
+// TestEngine_CollectorTimeoutCancelsLongCollect checks that the per-collector
+// Timeout cuts a slow Collect short and that the resulting deadline error is
+// reported through the collector's health entry.
+func TestEngine_CollectorTimeoutCancelsLongCollect(t *testing.T) {
+	store := &fakeStore{}
+	// Collect would take 200ms, but the engine only allows it 20ms.
+	slow := &fakeCollector{name: "slow", interval: time.Hour, delay: 200 * time.Millisecond}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	eng := New(store, []CollectorConfig{{Collector: slow, Timeout: 20 * time.Millisecond}}, 0)
+	defer eng.Stop()
+	eng.Start(ctx)
+
+	snap := recvSnapshot(t, eng.Snapshots(), 400*time.Millisecond)
+	health, ok := snap.Collectors["slow"]
+	if !ok {
+		t.Fatalf("expected collector health entry for slow")
+	}
+	if health.LastError == nil {
+		t.Fatalf("expected LastError to be set")
+	}
+	if !errors.Is(health.LastError, context.DeadlineExceeded) {
+		t.Fatalf("expected context deadline exceeded; got %v", health.LastError)
+	}
+	// The store is still upserted after a failed collection.
+	if store.UpsertCount() < 1 {
+		t.Fatalf("expected store.Upsert to be called")
+	}
+}
+
+// TestEngine_RefreshNowTriggersImmediateCollect checks that RefreshNow makes a
+// collector run again well before its regular 200ms interval elapses, rather
+// than waiting for the next tick.
+func TestEngine_RefreshNowTriggersImmediateCollect(t *testing.T) {
+	store := &fakeStore{}
+	started := make(chan time.Time, 10)
+	col := &fakeCollector{name: "r", interval: 200 * time.Millisecond, callCh: started}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	eng := New(store, []CollectorConfig{{Collector: col, Timeout: time.Second}}, 0)
+	defer eng.Stop()
+	eng.Start(ctx)
+
+	// First collect happens immediately.
+	select {
+	case <-started:
+	case <-time.After(200 * time.Millisecond):
+		t.Fatalf("timed out waiting for initial collect")
+	}
+
+	// Trigger refresh; should happen well before the 200ms interval.
+	time.Sleep(10 * time.Millisecond)
+	eng.RefreshNow()
+
+	// The 120ms window ends before the first 200ms tick, so a collect seen
+	// here was caused by RefreshNow.
+	select {
+	case <-started:
+		// ok
+	case <-time.After(120 * time.Millisecond):
+		t.Fatalf("expected RefreshNow to trigger a collect quickly")
+	}
+}
+
+// TestEngine_MultipleCollectorsRunOnIntervals checks that collectors with
+// different intervals are each run repeatedly on their own schedule.
+func TestEngine_MultipleCollectorsRunOnIntervals(t *testing.T) {
+	fast := &fakeCollector{name: "fast", interval: 30 * time.Millisecond}
+	slow := &fakeCollector{name: "slow", interval: 80 * time.Millisecond}
+	cfgs := []CollectorConfig{
+		{Collector: fast, Timeout: time.Second},
+		{Collector: slow, Timeout: time.Second},
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	eng := New(&fakeStore{}, cfgs, 0)
+	eng.Start(ctx)
+	time.Sleep(220 * time.Millisecond) // let it run a bit
+	eng.Stop()                         // waits for the engine goroutines, so the counters are final
+
+	// Includes initial collect.
+	if n := fast.calls.Load(); n < 4 {
+		t.Fatalf("expected fast collector to be called multiple times; got %d", n)
+	}
+	if n := slow.calls.Load(); n < 2 {
+		t.Fatalf("expected slow collector to be called multiple times; got %d", n)
+	}
+}