feat: implement ControlTower TUI for cluster and host monitoring

Add a complete TUI application for monitoring Kubernetes clusters and host
systems. Features include:

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code with tests for host collectors,
engine, store, model, and export packages.
This commit is contained in:
OpenCode Test
2025-12-24 13:03:08 -08:00
parent c2c03fd664
commit 1421b4659e
40 changed files with 5941 additions and 0 deletions

182
internal/store/store.go Normal file
View File

@@ -0,0 +1,182 @@
package store
import (
"sync"
"time"
"tower/internal/model"
)
// defaultResolveAfter is the absence window New falls back to when the caller
// passes a non-positive resolveAfter.
const defaultResolveAfter = 30 * time.Second
// Store is an in-memory IssueStore.
//
// Responsibilities (per PLAN.md):
//   - Dedupe by Issue.ID
//   - Track FirstSeen/LastSeen
//   - Maintain State (Open/Acknowledged/Resolved)
//   - Resolve issues only after resolveAfter duration of continuous absence
//   - Acknowledgements are in-memory only (not persisted)
//   - Safe for concurrent use
type Store struct {
	// mu guards the issues and ack maps. The methods in this file always take
	// the write lock, because even Snapshot mutates issue state while applying
	// resolutions.
	mu sync.RWMutex

	// resolveAfter is how long an issue must go unseen (measured from its
	// LastSeen) before it is marked Resolved. New replaces non-positive values
	// with defaultResolveAfter.
	resolveAfter time.Duration

	// issues holds the latest known version of each issue keyed by stable ID.
	// Resolved issues stay in the map; no code in this file deletes entries.
	issues map[string]model.Issue

	// ack is an in-memory toggle keyed by issue ID.
	// If true and the issue is currently present, its state is Acknowledged.
	// Entries are set by Acknowledge and removed only by Unacknowledge, so an
	// acknowledgement outlives resolution and applies again if the issue
	// reappears.
	ack map[string]bool
}
// New constructs an empty Store whose issues resolve after resolveAfter of
// continuous absence.
//
// A zero or negative resolveAfter selects defaultResolveAfter (30s).
func New(resolveAfter time.Duration) *Store {
	window := resolveAfter
	if window <= 0 {
		window = defaultResolveAfter
	}

	st := new(Store)
	st.resolveAfter = window
	st.issues = make(map[string]model.Issue)
	st.ack = make(map[string]bool)
	return st
}
// Upsert merges the issues that are "currently true" for this tick.
//
// Incoming is deduped by Issue.ID: issues with an empty ID are ignored and the
// first instance of each ID wins for non-timestamp fields. FirstSeen, LastSeen
// and State are managed by the store; values supplied by the caller for those
// fields are overwritten:
//   - A new ID, or one whose stored state is Resolved, starts a new "episode"
//     with FirstSeen = LastSeen = now.
//   - An ID that is already Open/Acknowledged keeps its FirstSeen and gets
//     LastSeen = now; every other field is replaced by the incoming value.
//   - State is Acknowledged when the ID has an active acknowledgement and
//     Open otherwise.
//
// Known issues that are absent from incoming are marked Resolved once they
// have gone unseen for at least resolveAfter.
//
// Each accepted issue is passed through deepCopyIssue before it is stored, so
// the store does not share the caller's Evidence map.
func (s *Store) Upsert(now time.Time, incoming []model.Issue) {
	// Pre-dedupe and copy without locking to keep lock hold times small.
	seen := make(map[string]model.Issue, len(incoming))
	for _, iss := range incoming {
		if iss.ID == "" {
			// Ignore invalid issues. ID is the stable dedupe key.
			continue
		}
		if _, dup := seen[iss.ID]; dup {
			continue
		}
		// Snapshot hands out deep copies; do the same on the way in so a caller
		// that later mutates its Evidence map cannot race with the store.
		seen[iss.ID] = deepCopyIssue(iss)
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	for id, in := range seen {
		// Default to a fresh episode; an existing unresolved issue keeps the
		// FirstSeen of the episode that is still running.
		in.FirstSeen = now
		if existing, ok := s.issues[id]; ok && existing.State != model.StateResolved {
			in.FirstSeen = existing.FirstSeen
		}
		in.LastSeen = now

		in.State = model.StateOpen
		if s.ack[id] {
			in.State = model.StateAcknowledged
		}
		s.issues[id] = in
	}

	// Update resolved state for issues not present this tick.
	s.applyResolutionsLocked(now, seen)
}
// Snapshot returns deep copies of every issue the store knows about, after
// first marking as Resolved any issue whose LastSeen is at least resolveAfter
// older than now.
//
// The result is never nil. Its order follows map iteration and is therefore
// unspecified.
func (s *Store) Snapshot(now time.Time) []model.Issue {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Snapshot has no notion of which IDs are present "this tick", so it passes
	// a nil present set and resolution is decided purely by LastSeen age.
	s.applyResolutionsLocked(now, nil)

	snapshot := make([]model.Issue, len(s.issues))
	idx := 0
	for id := range s.issues {
		snapshot[idx] = deepCopyIssue(s.issues[id])
		idx++
	}
	return snapshot
}
// Acknowledge records an in-memory acknowledgement for id.
//
// An empty id is a no-op. The acknowledgement is recorded even if the store
// does not currently hold an issue with that ID. If the issue exists and is
// not Resolved, its state becomes Acknowledged immediately.
func (s *Store) Acknowledge(id string) {
	if len(id) == 0 {
		return
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	s.ack[id] = true

	// Resolved issues keep their state; the flag takes effect if they reappear.
	if current, found := s.issues[id]; found && current.State != model.StateResolved {
		current.State = model.StateAcknowledged
		s.issues[id] = current
	}
}
// Unacknowledge removes the in-memory acknowledgement for id.
//
// An empty id is a no-op. If the issue exists and is not Resolved, its state
// is set back to Open; a Resolved issue is left untouched.
func (s *Store) Unacknowledge(id string) {
	if len(id) == 0 {
		return
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	delete(s.ack, id)

	if current, found := s.issues[id]; found && current.State != model.StateResolved {
		current.State = model.StateOpen
		s.issues[id] = current
	}
}
// applyResolutionsLocked marks as Resolved every stored issue that is not in
// present, is not already Resolved, and was last seen at least resolveAfter
// before now.
//
// present may be nil, in which case no issue is exempt. Only its keys are
// consulted. The caller must hold s.mu for writing.
func (s *Store) applyResolutionsLocked(now time.Time, present map[string]model.Issue) {
	if s.resolveAfter <= 0 {
		// Without a positive window nothing can ever resolve.
		return
	}

	for id, current := range s.issues {
		// Lookups on a nil map simply report "not found", so no nil guard is needed.
		if _, stillPresent := present[id]; stillPresent {
			continue
		}
		if current.State == model.StateResolved || now.Sub(current.LastSeen) < s.resolveAfter {
			continue
		}
		current.State = model.StateResolved
		s.issues[id] = current
	}
}
// deepCopyIssue returns a copy of in whose Evidence map, when non-nil, is a
// freshly allocated clone. A nil Evidence stays nil.
//
// NOTE(review): only Evidence is cloned; if model.Issue has other
// reference-typed fields they are still shared with the input — confirm.
func deepCopyIssue(in model.Issue) model.Issue {
	if in.Evidence == nil {
		return in
	}

	clone := in
	clone.Evidence = make(map[string]string, len(in.Evidence))
	for key, val := range in.Evidence {
		clone.Evidence[key] = val
	}
	return clone
}

View File

@@ -0,0 +1,101 @@
package store
import (
"testing"
"time"
"tower/internal/model"
)
// TestStore_Upsert_DedupAndTimestamps verifies that Upsert collapses duplicate
// IDs within one call (first instance wins), stamps FirstSeen/LastSeen/State on
// first sight, and on a later Upsert preserves FirstSeen while refreshing
// LastSeen and the non-timestamp fields.
func TestStore_Upsert_DedupAndTimestamps(t *testing.T) {
	now1 := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	now2 := now1.Add(5 * time.Second)

	s := New(30 * time.Second)

	// Same ID twice in one Upsert should dedupe.
	s.Upsert(now1, []model.Issue{
		{ID: "i-1", Title: "first"},
		{ID: "i-1", Title: "should be ignored"},
	})

	snap1 := s.Snapshot(now1)
	if len(snap1) != 1 {
		t.Fatalf("expected 1 issue, got %d", len(snap1))
	}
	if snap1[0].ID != "i-1" {
		t.Fatalf("expected id i-1, got %q", snap1[0].ID)
	}
	// The count alone does not prove which duplicate survived.
	if snap1[0].Title != "first" {
		t.Fatalf("expected first duplicate to win, got Title=%q", snap1[0].Title)
	}
	if !snap1[0].FirstSeen.Equal(now1) {
		t.Fatalf("expected FirstSeen=%v, got %v", now1, snap1[0].FirstSeen)
	}
	if !snap1[0].LastSeen.Equal(now1) {
		t.Fatalf("expected LastSeen=%v, got %v", now1, snap1[0].LastSeen)
	}
	if snap1[0].State != model.StateOpen {
		t.Fatalf("expected State=Open, got %q", snap1[0].State)
	}

	// Subsequent Upsert for same ID should preserve FirstSeen and update LastSeen.
	s.Upsert(now2, []model.Issue{{ID: "i-1", Title: "updated"}})

	snap2 := s.Snapshot(now2)
	if len(snap2) != 1 {
		t.Fatalf("expected 1 issue, got %d", len(snap2))
	}
	if !snap2[0].FirstSeen.Equal(now1) {
		t.Fatalf("expected FirstSeen to remain %v, got %v", now1, snap2[0].FirstSeen)
	}
	if !snap2[0].LastSeen.Equal(now2) {
		t.Fatalf("expected LastSeen=%v, got %v", now2, snap2[0].LastSeen)
	}
	// Non-timestamp fields come from the latest incoming issue.
	if snap2[0].Title != "updated" {
		t.Fatalf("expected Title to be refreshed to %q, got %q", "updated", snap2[0].Title)
	}
}
// TestStore_AckPreservedWhilePresent verifies that an acknowledgement survives
// a later Upsert of the same issue and that Unacknowledge returns it to Open.
func TestStore_AckPreservedWhilePresent(t *testing.T) {
	now1 := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	now2 := now1.Add(1 * time.Second)

	s := New(30 * time.Second)
	s.Upsert(now1, []model.Issue{{ID: "i-1", Title: "t"}})
	s.Acknowledge("i-1")

	// Upsert again while present should remain Acked.
	s.Upsert(now2, []model.Issue{{ID: "i-1", Title: "t2"}})

	snap := s.Snapshot(now2)
	if len(snap) != 1 {
		t.Fatalf("expected 1 issue, got %d", len(snap))
	}
	if snap[0].State != model.StateAcknowledged {
		t.Fatalf("expected State=Acknowledged, got %q", snap[0].State)
	}

	s.Unacknowledge("i-1")

	snap2 := s.Snapshot(now2)
	// Guard the index so a regression fails with a message instead of a panic.
	if len(snap2) != 1 {
		t.Fatalf("expected 1 issue after unack, got %d", len(snap2))
	}
	if snap2[0].State != model.StateOpen {
		t.Fatalf("expected State=Open after unack, got %q", snap2[0].State)
	}
}
// TestStore_ResolvesOnlyAfterAbsenceWindow verifies that an issue which stops
// being reported stays Open until it has been unseen for resolveAfter, and is
// Resolved once that window has elapsed.
func TestStore_ResolvesOnlyAfterAbsenceWindow(t *testing.T) {
	resolveAfter := 10 * time.Second
	now0 := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)

	s := New(resolveAfter)
	s.Upsert(now0, []model.Issue{{ID: "i-1", Title: "t"}})

	// Miss a tick shortly after; should not resolve due to flap suppression / window.
	s.Upsert(now0.Add(1*time.Second), nil)

	snap1 := s.Snapshot(now0.Add(9 * time.Second))
	if len(snap1) != 1 {
		t.Fatalf("expected 1 issue, got %d", len(snap1))
	}
	if snap1[0].State != model.StateOpen {
		t.Fatalf("expected still Open before resolveAfter, got %q", snap1[0].State)
	}

	// Still absent beyond resolveAfter => should resolve.
	snap2 := s.Snapshot(now0.Add(11 * time.Second))
	// Guard the index so a regression fails with a message instead of a panic.
	if len(snap2) != 1 {
		t.Fatalf("expected 1 issue after resolveAfter, got %d", len(snap2))
	}
	if snap2[0].State != model.StateResolved {
		t.Fatalf("expected Resolved after absence > resolveAfter, got %q", snap2[0].State)
	}
}