feat: implement ControlTower TUI for cluster and host monitoring

Add a complete TUI application for monitoring Kubernetes clusters and host
systems.

Core features:
- Collector framework with concurrent scheduling (interface sketched after this list)
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality
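
The collector contract lives in internal/collectors, which is not part of this excerpt. The sketch below reconstructs it from how internal/engine and the test fake in this commit use it; the empty Status struct is an assumption standing in for the real type.

// Sketch of the collector contract consumed by the engine, inferred from
// engine.go and the fakeCollector in the tests; not the verbatim source of
// internal/collectors.
package collectors

import (
	"context"
	"time"

	"tower/internal/model"
)

// Status carries per-collector health detail; its real fields live in
// internal/collectors and are assumed away here.
type Status struct{}

// Collector is implemented by every host and Kubernetes collector.
type Collector interface {
	// Name identifies the collector in health maps and the UI.
	Name() string
	// Interval is the collector's own scheduling period.
	Interval() time.Duration
	// Collect gathers issues and must honor ctx cancellation and timeouts.
	Collect(ctx context.Context) ([]model.Issue, Status, error)
}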

UX improvements (key handling sketched after this list):
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details
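
The bindings above are dispatched in the Bubble Tea update loop. The following is a minimal, hypothetical sketch of that dispatch; uiModel, rows, copyToClipboard, and the "?" help key are illustrative stand-ins, not the TUI code shipped in this commit.

// Hypothetical key dispatch showing how the bindings above could map onto a
// Bubble Tea Update loop. uiModel, rows, copyToClipboard, and the "?" help
// key are illustrative; the actual TUI model in this commit is richer.
package main

import (
	"log"

	tea "github.com/charmbracelet/bubbletea"
)

type uiModel struct {
	rows      []string
	cursor    int
	showHelp  bool
	darkTheme bool
}

func (m uiModel) Init() tea.Cmd { return nil }

// View is stubbed out; the real model renders the table and details pane.
func (m uiModel) View() string { return "" }

func (m uiModel) selected() string {
	if len(m.rows) == 0 {
		return ""
	}
	return m.rows[m.cursor]
}

// copyToClipboard is a placeholder command; the real code would call a
// clipboard helper.
func copyToClipboard(s string) tea.Cmd { return nil }

func (m uiModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
	key, ok := msg.(tea.KeyMsg)
	if !ok {
		return m, nil
	}
	switch key.String() {
	case "j", "down": // vi-style navigation
		if m.cursor < len(m.rows)-1 {
			m.cursor++
		}
	case "k", "up":
		if m.cursor > 0 {
			m.cursor--
		}
	case "g": // jump to top
		m.cursor = 0
	case "G": // jump to bottom
		if len(m.rows) > 0 {
			m.cursor = len(m.rows) - 1
		}
	case "y": // copy the selected row
		return m, copyToClipboard(m.selected())
	case "T": // theme toggle
		m.darkTheme = !m.darkTheme
	case "?": // help overlay
		m.showHelp = !m.showHelp
	case "q", "ctrl+c":
		return m, tea.Quit
	}
	return m, nil
}

func main() {
	if _, err := tea.NewProgram(uiModel{rows: []string{"issue A", "issue B"}}).Run(); err != nil {
		log.Fatal(err)
	}
}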

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression (policy sketched after this list)
- RBAC error handling
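
The store's flap handling is not shown in this diff. As one plausible policy, the sketch below suppresses an issue that re-fires shortly after resolving; the window, type names, and helper are all illustrative assumptions.

// Illustrative flap-suppression policy, not the store logic from this commit:
// an issue that re-fires shortly after resolving is held back for a while.
package store

import "time"

// flapWindow is an assumed tuning knob, not a value from this commit.
const flapWindow = 2 * time.Minute

// flapState would be tracked per issue ID by the store.
type flapState struct {
	lastResolved time.Time // zero value means the issue never resolved
}

// suppress reports whether a re-firing issue should be hidden because it
// resolved too recently, i.e. it is flapping.
func suppress(s flapState, now time.Time) bool {
	if s.lastResolved.IsZero() {
		return false
	}
	return now.Sub(s.lastResolved) < flapWindow
}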

Files: All core application code with tests for host collectors,
engine, store, model, and export packages.
Author: OpenCode Test
Date: 2025-12-24 13:03:08 -08:00
Parent: c2c03fd664
Commit: 1421b4659e
40 changed files with 5941 additions and 0 deletions

internal/engine/engine.go (new file, 309 lines)

@@ -0,0 +1,309 @@
package engine

import (
	"context"
	"sync"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// IssueStore is the Engine's dependency on the issue store.
//
// The concrete implementation lives in internal/store. We depend on an interface
// here to keep the Engine testable.
//
// NOTE: The store is responsible for dedupe + lifecycle (resolve-after, ack, etc.).
// The Engine simply merges outputs from collectors and passes them into Upsert.
//
// Engine calls Snapshot() to publish UI snapshots.
//
// This interface must be satisfied by internal/store.IssueStore.
// (Do not add persistence here.)
type IssueStore interface {
	Upsert(now time.Time, issues []model.Issue)
	Snapshot(now time.Time) []model.Issue
}

// CollectorConfig wires a collector into the Engine.
// Timeout applies per Collect() invocation.
// Interval comes from the collector itself.
//
// If Timeout <= 0, no per-collector timeout is applied.
type CollectorConfig struct {
	Collector collectors.Collector
	Timeout   time.Duration
}

// CollectorHealth tracks the current health of a collector.
//
// Status is the last status returned by the collector.
// LastError is the last error returned by the collector (if any).
type CollectorHealth struct {
	Status     collectors.Status
	LastError  error
	LastRun    time.Time
	LastOK     time.Time
	LastRunDur time.Duration
}

// Snapshot is the Engine's UI-facing view.
//
// Issues are sorted using the default sort order (Priority desc, then recency desc).
// Collectors is keyed by collector name.
type Snapshot struct {
	At         time.Time
	Issues     []model.Issue
	Collectors map[string]CollectorHealth
}

type collectResult struct {
	name     string
	at       time.Time
	duration time.Duration
	issues   []model.Issue
	status   collectors.Status
	err      error
}

type collectorRunner struct {
	cfg       CollectorConfig
	refreshCh chan struct{}
}

// Engine runs collectors on their own schedules, merges issues, and updates the store.
// It publishes snapshots for the UI.
//
// Lifecycle:
//
//	e := New(...)
//	e.Start(ctx)
//	defer e.Stop()
//
// Snapshots are emitted:
//   - after any store update (collector completion)
//   - periodically at refreshInterval (if > 0)
//
// RefreshNow() forces all collectors to run immediately.
type Engine struct {
	store           IssueStore
	refreshInterval time.Duration

	snapshots chan Snapshot
	results   chan collectResult

	mu                      sync.Mutex
	latestIssuesByCollector map[string][]model.Issue
	health                  map[string]CollectorHealth

	collectors []collectorRunner

	cancel    context.CancelFunc
	wg        sync.WaitGroup
	startOnce sync.Once
	stopOnce  sync.Once
}

// New constructs an Engine.
//
// refreshInterval governs periodic snapshot emission. If refreshInterval <= 0,
// snapshots are only emitted when collectors finish.
func New(st IssueStore, cs []CollectorConfig, refreshInterval time.Duration) *Engine {
	runners := make([]collectorRunner, 0, len(cs))
	for _, c := range cs {
		runners = append(runners, collectorRunner{
			cfg:       c,
			refreshCh: make(chan struct{}, 1),
		})
	}
	return &Engine{
		store:                   st,
		refreshInterval:         refreshInterval,
		snapshots:               make(chan Snapshot, 32),
		results:                 make(chan collectResult, 64),
		latestIssuesByCollector: map[string][]model.Issue{},
		health:                  map[string]CollectorHealth{},
		collectors:              runners,
	}
}

// Start begins background collection. It is safe to call Start once.
func (e *Engine) Start(parent context.Context) {
	e.startOnce.Do(func() {
		ctx, cancel := context.WithCancel(parent)
		e.cancel = cancel

		e.wg.Add(1)
		go func() {
			defer e.wg.Done()
			e.runAggregator(ctx)
		}()

		for i := range e.collectors {
			r := &e.collectors[i]
			e.wg.Add(1)
			go func(r *collectorRunner) {
				defer e.wg.Done()
				e.runCollector(ctx, r)
			}(r)
		}
	})
}

// Stop stops the Engine and closes the snapshots channel.
func (e *Engine) Stop() {
	e.stopOnce.Do(func() {
		if e.cancel != nil {
			e.cancel()
		}
		e.wg.Wait()
		close(e.snapshots)
	})
}

// Snapshots returns a receive-only channel of snapshots.
func (e *Engine) Snapshots() <-chan Snapshot { return e.snapshots }

// RefreshNow forces all collectors to run immediately.
//
// This is non-blocking; if a collector already has a refresh queued, it will not
// queue additional refresh signals.
func (e *Engine) RefreshNow() {
	for i := range e.collectors {
		ch := e.collectors[i].refreshCh
		select {
		case ch <- struct{}{}:
		default:
		}
	}
}

func (e *Engine) runCollector(ctx context.Context, r *collectorRunner) {
	name := r.cfg.Collector.Name()
	interval := r.cfg.Collector.Interval()
	if interval <= 0 {
		interval = time.Second
	}

	doCollect := func() {
		start := time.Now()

		collectCtx := ctx
		cancel := func() {}
		if r.cfg.Timeout > 0 {
			collectCtx, cancel = context.WithTimeout(ctx, r.cfg.Timeout)
		}
		defer cancel()

		issues, st, err := r.cfg.Collector.Collect(collectCtx)
		finish := time.Now()
		dur := finish.Sub(start)

		// Copy issues slice to avoid data races when collectors reuse underlying storage.
		copied := make([]model.Issue, len(issues))
		copy(copied, issues)

		res := collectResult{
			name:     name,
			at:       finish,
			duration: dur,
			issues:   copied,
			status:   st,
			err:      err,
		}

		select {
		case e.results <- res:
		case <-ctx.Done():
			return
		}
	}

	// Collect immediately on start so the UI isn't empty for the first interval.
	doCollect()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			doCollect()
		case <-r.refreshCh:
			doCollect()
		}
	}
}

func (e *Engine) runAggregator(ctx context.Context) {
	var ticker *time.Ticker
	var tick <-chan time.Time
	if e.refreshInterval > 0 {
		ticker = time.NewTicker(e.refreshInterval)
		defer ticker.Stop()
		tick = ticker.C
	}

	emitSnapshot := func(at time.Time) {
		issues := e.store.Snapshot(at)
		// Ensure deterministic default sort for the UI.
		model.SortIssuesDefault(issues)

		// Copy collector health map.
		e.mu.Lock()
		h := make(map[string]CollectorHealth, len(e.health))
		for k, v := range e.health {
			h[k] = v
		}
		e.mu.Unlock()

		snap := Snapshot{At: at, Issues: issues, Collectors: h}

		// Non-blocking publish; drop if UI is behind.
		select {
		case e.snapshots <- snap:
		default:
		}
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-tick:
			emitSnapshot(time.Now())
		case res := <-e.results:
			e.mu.Lock()
			// On collector errors, keep the last known issues for that collector.
			// This prevents transient errors/timeouts from making issues disappear.
			if res.err == nil {
				e.latestIssuesByCollector[res.name] = res.issues
			}

			ch := e.health[res.name]
			ch.Status = res.status
			ch.LastRun = res.at
			ch.LastRunDur = res.duration
			ch.LastError = res.err
			if res.err == nil {
				ch.LastOK = res.at
			}
			e.health[res.name] = ch

			merged := make([]model.Issue, 0, 64)
			for _, issues := range e.latestIssuesByCollector {
				merged = append(merged, issues...)
			}
			e.mu.Unlock()

			e.store.Upsert(res.at, merged)
			emitSnapshot(res.at)
		}
	}
}
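
For orientation, here is a minimal wiring sketch of the Engine API defined above. Only the engine calls are taken from this file; store.New and collectors.NewDiskCollector are assumed constructors from packages not shown in this diff.

// Minimal wiring sketch for the Engine API above. Only the engine calls come
// from engine.go; the store and collector constructors are assumptions.
package main

import (
	"context"
	"fmt"
	"time"

	"tower/internal/collectors"
	"tower/internal/engine"
	"tower/internal/store"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	st := store.New()                     // assumed constructor satisfying engine.IssueStore
	disk := collectors.NewDiskCollector() // assumed concrete collector

	e := engine.New(st, []engine.CollectorConfig{
		{Collector: disk, Timeout: 5 * time.Second},
	}, 2*time.Second) // also emit snapshots every 2s

	e.Start(ctx)
	defer e.Stop()

	// Consume a few snapshots; a real caller would forward them to the TUI.
	for i := 0; i < 3; i++ {
		snap := <-e.Snapshots()
		fmt.Printf("%s: %d issues, %d collectors\n",
			snap.At.Format(time.Kitchen), len(snap.Issues), len(snap.Collectors))
	}
}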

Engine test file (new, 225 lines)

@@ -0,0 +1,225 @@
package engine

import (
	"context"
	"errors"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

type fakeStore struct {
	mu          sync.Mutex
	upsertCalls int
	lastNow     time.Time
	lastIssues  []model.Issue
}

func (s *fakeStore) Upsert(now time.Time, issues []model.Issue) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.upsertCalls++
	s.lastNow = now
	// Deep-ish copy: slice copy is enough for our tests.
	s.lastIssues = append([]model.Issue(nil), issues...)
}

func (s *fakeStore) Snapshot(now time.Time) []model.Issue {
	s.mu.Lock()
	defer s.mu.Unlock()
	return append([]model.Issue(nil), s.lastIssues...)
}

func (s *fakeStore) UpsertCount() int {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.upsertCalls
}

type fakeCollector struct {
	name     string
	interval time.Duration
	// delay simulates work. If ctx is canceled/timeout hits, Collect returns ctx.Err().
	delay    time.Duration
	issuesFn func(call int64) []model.Issue

	calls  atomic.Int64
	callCh chan time.Time
}

func (c *fakeCollector) Name() string { return c.name }

func (c *fakeCollector) Interval() time.Duration {
	return c.interval
}

func (c *fakeCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	call := c.calls.Add(1)
	if c.callCh != nil {
		select {
		case c.callCh <- time.Now():
		default:
		}
	}
	if c.delay > 0 {
		t := time.NewTimer(c.delay)
		defer t.Stop()
		select {
		case <-ctx.Done():
			var st collectors.Status
			return nil, st, ctx.Err()
		case <-t.C:
		}
	}
	var st collectors.Status
	if c.issuesFn != nil {
		return c.issuesFn(call), st, nil
	}
	return nil, st, nil
}

func recvSnapshot(t *testing.T, ch <-chan Snapshot, within time.Duration) Snapshot {
	t.Helper()
	select {
	case s := <-ch:
		return s
	case <-time.After(within):
		t.Fatalf("timed out waiting for snapshot")
		return Snapshot{}
	}
}

func TestEngine_UpsertAndSnapshotsEmitted(t *testing.T) {
	st := &fakeStore{}
	c := &fakeCollector{
		name:     "c1",
		interval: 100 * time.Millisecond,
		issuesFn: func(call int64) []model.Issue {
			return []model.Issue{{
				ID:       "id-1",
				Priority: model.PriorityP1,
				Title:    "hello",
				LastSeen: time.Now(),
			}}
		},
	}

	e := New(st, []CollectorConfig{{Collector: c, Timeout: 200 * time.Millisecond}}, 0)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e.Stop()
	e.Start(ctx)

	snap := recvSnapshot(t, e.Snapshots(), 300*time.Millisecond)

	if st.UpsertCount() < 1 {
		t.Fatalf("expected store.Upsert to be called")
	}
	if len(snap.Issues) != 1 || snap.Issues[0].ID != "id-1" {
		t.Fatalf("expected snapshot to contain issue id-1; got %+v", snap.Issues)
	}
	if _, ok := snap.Collectors["c1"]; !ok {
		t.Fatalf("expected collector health entry for c1")
	}
}

func TestEngine_CollectorTimeoutCancelsLongCollect(t *testing.T) {
	st := &fakeStore{}
	c := &fakeCollector{
		name:     "slow",
		interval: time.Hour,
		delay:    200 * time.Millisecond,
	}

	e := New(st, []CollectorConfig{{Collector: c, Timeout: 20 * time.Millisecond}}, 0)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e.Stop()
	e.Start(ctx)

	snap := recvSnapshot(t, e.Snapshots(), 400*time.Millisecond)

	ch, ok := snap.Collectors["slow"]
	if !ok {
		t.Fatalf("expected collector health entry for slow")
	}
	if ch.LastError == nil {
		t.Fatalf("expected LastError to be set")
	}
	if !errors.Is(ch.LastError, context.DeadlineExceeded) {
		t.Fatalf("expected context deadline exceeded; got %v", ch.LastError)
	}
	if st.UpsertCount() < 1 {
		t.Fatalf("expected store.Upsert to be called")
	}
}

func TestEngine_RefreshNowTriggersImmediateCollect(t *testing.T) {
	st := &fakeStore{}
	callCh := make(chan time.Time, 10)
	c := &fakeCollector{
		name:     "r",
		interval: 200 * time.Millisecond,
		callCh:   callCh,
	}

	e := New(st, []CollectorConfig{{Collector: c, Timeout: time.Second}}, 0)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e.Stop()
	e.Start(ctx)

	// First collect happens immediately.
	select {
	case <-callCh:
	case <-time.After(200 * time.Millisecond):
		t.Fatalf("timed out waiting for initial collect")
	}

	// Trigger refresh; should happen well before the 200ms interval.
	time.Sleep(10 * time.Millisecond)
	e.RefreshNow()

	select {
	case <-callCh:
		// ok
	case <-time.After(120 * time.Millisecond):
		t.Fatalf("expected RefreshNow to trigger a collect quickly")
	}
}

func TestEngine_MultipleCollectorsRunOnIntervals(t *testing.T) {
	st := &fakeStore{}
	fast := &fakeCollector{name: "fast", interval: 30 * time.Millisecond}
	slow := &fakeCollector{name: "slow", interval: 80 * time.Millisecond}

	e := New(st, []CollectorConfig{{Collector: fast, Timeout: time.Second}, {Collector: slow, Timeout: time.Second}}, 0)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	e.Start(ctx)

	// Let it run a bit.
	time.Sleep(220 * time.Millisecond)
	e.Stop()

	fastCalls := fast.calls.Load()
	slowCalls := slow.calls.Load()

	// Includes initial collect.
	if fastCalls < 4 {
		t.Fatalf("expected fast collector to be called multiple times; got %d", fastCalls)
	}
	if slowCalls < 2 {
		t.Fatalf("expected slow collector to be called multiple times; got %d", slowCalls)
	}
}
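
The tests above use fakeStore; the real store lives in internal/store and is not part of this excerpt. The sketch below shows one minimal way to satisfy the engine's IssueStore interface with naive dedupe and resolve-after expiry, leaving out ack, flap suppression, and rollups; the names and the policy itself are assumptions.

// Minimal sketch of something satisfying the engine's IssueStore interface,
// with naive dedupe by ID and resolve-after expiry. This is an assumption for
// illustration, not the internal/store code from this commit.
package store

import (
	"sync"
	"time"

	"tower/internal/model"
)

type MemoryStore struct {
	mu           sync.Mutex
	resolveAfter time.Duration
	issues       map[string]model.Issue // keyed by Issue.ID
	lastSeen     map[string]time.Time   // when a collector last reported each ID
}

func NewMemory(resolveAfter time.Duration) *MemoryStore {
	return &MemoryStore{
		resolveAfter: resolveAfter,
		issues:       map[string]model.Issue{},
		lastSeen:     map[string]time.Time{},
	}
}

// Upsert dedupes incoming issues by ID and records when each was last seen.
func (s *MemoryStore) Upsert(now time.Time, issues []model.Issue) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, is := range issues {
		s.issues[is.ID] = is
		s.lastSeen[is.ID] = now
	}
}

// Snapshot drops issues that have not been reported within resolveAfter and
// returns a copy of the rest.
func (s *MemoryStore) Snapshot(now time.Time) []model.Issue {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]model.Issue, 0, len(s.issues))
	for id, is := range s.issues {
		if now.Sub(s.lastSeen[id]) > s.resolveAfter {
			delete(s.issues, id)
			delete(s.lastSeen, id)
			continue
		}
		out = append(out, is)
	}
	return out
}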