feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collector, engine, store, model, and export packages.
This commit is contained in:
287
internal/collectors/host/disk.go
Normal file
287
internal/collectors/host/disk.go
Normal file
@@ -0,0 +1,287 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// DiskCollector checks filesystem block + inode pressure across mounts.
//
// It reads /proc/mounts to discover mounts and then uses statfs to compute usage.
// Pseudo filesystems are filtered out.
//
// Thresholds (PLAN.md):
//   - P1 if blocks OR inodes >= 92%
//   - P0 if blocks OR inodes >= 98%
//
// Issues are emitted per mount (one issue that includes both block+inode usage).
//
// NOTE: This collector is Linux-specific (it depends on /proc/mounts and
// syscall.Statfs).
type DiskCollector struct {
	// interval is the requested collection period; Interval() falls back to
	// 10s when this is zero or negative.
	interval time.Duration

	// readFile and statfs are injection seams for tests. Production values
	// (os.ReadFile and syscall.Statfs) are wired in by NewDiskCollector.
	readFile func(string) ([]byte, error)
	statfs func(path string, st *syscall.Statfs_t) error
}
|
||||
|
||||
func NewDiskCollector() *DiskCollector {
|
||||
return &DiskCollector{
|
||||
interval: 10 * time.Second,
|
||||
readFile: os.ReadFile,
|
||||
statfs: syscall.Statfs,
|
||||
}
|
||||
}
|
||||
|
||||
// Name identifies this collector within the collector framework.
func (c *DiskCollector) Name() string {
	return "host:disk"
}
|
||||
|
||||
func (c *DiskCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 10 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *DiskCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
b, err := c.readFile("/proc/mounts")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/mounts"}, err
|
||||
}
|
||||
|
||||
mounts := parseProcMounts(string(b))
|
||||
if len(mounts) == 0 {
|
||||
// Unusual but treat as degraded rather than hard error.
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "no mounts found"}, nil
|
||||
}
|
||||
|
||||
issues := make([]model.Issue, 0, 8)
|
||||
seenMount := map[string]struct{}{}
|
||||
|
||||
partialErrs := 0
|
||||
for _, m := range mounts {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return issues, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
if shouldSkipMount(m) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seenMount[m.MountPoint]; ok {
|
||||
continue
|
||||
}
|
||||
seenMount[m.MountPoint] = struct{}{}
|
||||
|
||||
var st syscall.Statfs_t
|
||||
if err := c.statfs(m.MountPoint, &st); err != nil {
|
||||
partialErrs++
|
||||
continue
|
||||
}
|
||||
|
||||
blockPct, blockFreeBytes := statfsBlockUsedPct(st)
|
||||
inodePct := statfsInodeUsedPct(st)
|
||||
|
||||
pri, ok := diskPriority(blockPct, inodePct)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
evidence := map[string]string{
|
||||
"mount": m.MountPoint,
|
||||
"fstype": m.FSType,
|
||||
"block_used_pct": fmt.Sprintf("%.1f", blockPct),
|
||||
"block_free_bytes": strconv.FormatUint(blockFreeBytes, 10),
|
||||
}
|
||||
if inodePct >= 0 {
|
||||
evidence["inode_used_pct"] = fmt.Sprintf("%.1f", inodePct)
|
||||
}
|
||||
|
||||
issues = append(issues, model.Issue{
|
||||
ID: fmt.Sprintf("host:disk:%s:usage", m.MountPoint),
|
||||
Category: model.CategoryStorage,
|
||||
Priority: pri,
|
||||
Title: fmt.Sprintf("Disk usage high on %s", m.MountPoint),
|
||||
Details: "Filesystem space and/or inodes are nearly exhausted.",
|
||||
Evidence: evidence,
|
||||
SuggestedFix: fmt.Sprintf(
|
||||
"Inspect usage:\n df -h %s\n df -i %s\nFind large directories:\n sudo du -xh --max-depth=2 %s | sort -h | tail",
|
||||
m.MountPoint, m.MountPoint, m.MountPoint,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
st := collectors.OKStatus()
|
||||
if partialErrs > 0 {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = fmt.Sprintf("partial failures: %d mounts", partialErrs)
|
||||
}
|
||||
return issues, st, nil
|
||||
}
|
||||
|
||||
// procMount is one parsed line of /proc/mounts.
type procMount struct {
	Device     string // first field, octal escapes decoded (e.g. "/dev/sda1")
	MountPoint string // second field, octal escapes decoded (absolute path)
	FSType     string // third field (e.g. "ext4", "tmpfs")
	Options    string // fourth field, comma-separated; empty when absent
}

// parseProcMounts parses /proc/mounts content into procMount records.
// Blank lines and lines with fewer than three fields are skipped, so every
// returned entry has at least Device, MountPoint, and FSType populated.
// Scanner errors are ignored: the input is an in-memory string, and the only
// possible failure (a single line over the scanner's buffer limit) would
// simply truncate the result.
func parseProcMounts(content string) []procMount {
	sc := bufio.NewScanner(strings.NewReader(content))
	mounts := make([]procMount, 0, 32)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 3 {
			continue
		}
		m := procMount{
			Device:     unescapeProcMountsField(fields[0]),
			MountPoint: unescapeProcMountsField(fields[1]),
			FSType:     fields[2],
		}
		if len(fields) >= 4 {
			m.Options = fields[3]
		}
		mounts = append(mounts, m)
	}
	return mounts
}

// procMountsUnescaper decodes the octal escape sequences the kernel writes
// into /proc/mounts fields (space, tab, newline, backslash). Built once at
// package init rather than on every call.
var procMountsUnescaper = strings.NewReplacer(
	"\\040", " ",
	"\\011", "\t",
	"\\012", "\n",
	"\\134", "\\",
)

// unescapeProcMountsField decodes kernel octal escapes in a single
// /proc/mounts field. The most common one is a space as \040.
func unescapeProcMountsField(s string) string {
	return procMountsUnescaper.Replace(s)
}
|
||||
|
||||
// pseudoFSTypes lists filesystem types that do not represent real disk
// capacity — kernel/virtual filesystems, container overlay mounts, and
// read-only images. Mounts of these types are excluded from disk-pressure
// checks. Kept sorted for easy scanning.
var pseudoFSTypes = map[string]struct{}{
	"autofs":          {},
	"binfmt_misc":     {},
	"bpf":             {},
	"cgroup":          {},
	"cgroup2":         {},
	"cgroupfs":        {},
	"configfs":        {},
	"debugfs":         {},
	"devpts":          {},
	"devtmpfs":        {},
	"efivarfs":        {},
	"fuse.gvfsd-fuse": {},
	"fuse.lxcfs":      {},
	"fusectl":         {},
	"hugetlbfs":       {},
	"mqueue":          {},
	"nsfs":            {},
	"overlay":         {}, // common container overlay mounts
	"overlayfs":       {}, // (non-standard) conservative skip
	"proc":            {},
	"procfs":          {},
	"pstore":          {},
	"ramfs":           {},
	"rpc_pipefs":      {},
	"securityfs":      {},
	"selinuxfs":       {},
	"squashfs":        {}, // typically read-only images
	"sysfs":           {},
	"systemd-1":       {},
	"tmpfs":           {},
	"tracefs":         {},
}
|
||||
|
||||
func shouldSkipMount(m procMount) bool {
|
||||
if m.MountPoint == "" {
|
||||
return true
|
||||
}
|
||||
// Filter by fstype.
|
||||
if _, ok := pseudoFSTypes[m.FSType]; ok {
|
||||
return true
|
||||
}
|
||||
// Filter common pseudo mountpoints.
|
||||
if strings.HasPrefix(m.MountPoint, "/proc") || strings.HasPrefix(m.MountPoint, "/sys") {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(m.MountPoint, "/dev") {
|
||||
// /dev itself can be a real mount in some cases, but usually isn't useful for disk pressure.
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func statfsBlockUsedPct(st syscall.Statfs_t) (usedPct float64, freeBytes uint64) {
|
||||
// Mirror df(1) semantics closely:
|
||||
// total = f_blocks
|
||||
// used = f_blocks - f_bfree
|
||||
// avail = f_bavail (space available to unprivileged user)
|
||||
// use% = used / (used + avail)
|
||||
if st.Blocks == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
bsize := uint64(st.Bsize)
|
||||
blocks := uint64(st.Blocks)
|
||||
bfree := uint64(st.Bfree)
|
||||
bavail := uint64(st.Bavail)
|
||||
|
||||
usedBlocks := blocks - bfree
|
||||
denom := usedBlocks + bavail
|
||||
if denom == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
freeBytes = bavail * bsize
|
||||
usedPct = (float64(usedBlocks) / float64(denom)) * 100.0
|
||||
return usedPct, freeBytes
|
||||
}
|
||||
|
||||
// statfsInodeUsedPct returns inode used percent. If inodes are unavailable (f_files==0), returns -1.
|
||||
func statfsInodeUsedPct(st syscall.Statfs_t) float64 {
|
||||
if st.Files == 0 {
|
||||
return -1
|
||||
}
|
||||
total := float64(st.Files)
|
||||
free := float64(st.Ffree)
|
||||
used := total - free
|
||||
return (used / total) * 100.0
|
||||
}
|
||||
|
||||
func diskPriority(blockPct, inodePct float64) (model.Priority, bool) {
|
||||
maxPct := blockPct
|
||||
if inodePct > maxPct {
|
||||
maxPct = inodePct
|
||||
}
|
||||
// inodePct may be -1 if not supported; ignore in that case.
|
||||
if inodePct < 0 {
|
||||
maxPct = blockPct
|
||||
}
|
||||
|
||||
switch {
|
||||
case maxPct >= 98.0:
|
||||
return model.PriorityP0, true
|
||||
case maxPct >= 92.0:
|
||||
return model.PriorityP1, true
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
// Compile-time assertion that *DiskCollector satisfies collectors.Collector.
var _ collectors.Collector = (*DiskCollector)(nil)
|
||||
Reference in New Issue
Block a user