feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collectors, engine, store, model, and export packages.
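The load collector below implements the collector framework's contract. For orientation, here is a sketch of that contract as it can be inferred from this file's usage of tower/internal/collectors: the Health constants, OKStatus(), and the method set all appear in the diff, but the declarations themselves are a reconstruction, not the verbatim source.

// Sketch (inferred from load.go below): the contract implemented by host and
// Kubernetes collectors. The underlying Health type and exact declarations
// are assumptions.
package collectors

import (
	"context"
	"time"

	"tower/internal/model"
)

type Health int

const (
	HealthOK Health = iota
	HealthDegraded
	HealthError
)

// Status reports a collector's own health alongside its findings.
type Status struct {
	Health  Health
	Message string
}

func OKStatus() Status { return Status{Health: HealthOK} }

// Collector is scheduled concurrently by the engine and polled every Interval().
type Collector interface {
	Name() string
	Interval() time.Duration
	Collect(ctx context.Context) ([]model.Issue, Status, error)
}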
internal/collectors/host/load.go (new file, 127 lines)
@@ -0,0 +1,127 @@
package host

import (
	"context"
	"fmt"
	"os"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// LoadCollector evaluates the 1-minute load average normalized by logical CPU count.
//
// Thresholds (PLAN.md), normalized by CPU count:
//   - P2 if load1/cpus >= 4.0, sustained for 120s
//   - P1 if load1/cpus >= 6.0, sustained for 120s
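//
// For example, on an 8-CPU host a load1 of 32.0 (32.0/8 = 4.0 per CPU) held
// for 120s raises a P2 issue, and a load1 of 48.0 (6.0 per CPU) raises a P1.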
//
// NOTE: Linux-specific (reads /proc/loadavg).
// Thread-safe: Collect() can be called concurrently.
type LoadCollector struct {
	interval time.Duration

	// Seams injected for testability; NewLoadCollector wires the real ones.
	now      func() time.Time
	readFile func(string) ([]byte, error)
	cpuCount func() int

	mu sync.Mutex

	// Sustained-state: the current candidate priority and when it was first seen.
	pri   model.Priority
	since time.Time
}

func NewLoadCollector() *LoadCollector {
	return &LoadCollector{
		interval: 5 * time.Second,
		now:      time.Now,
		readFile: os.ReadFile,
		cpuCount: runtime.NumCPU,
	}
}

func (c *LoadCollector) Name() string { return "host:load" }

func (c *LoadCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 5 * time.Second
	}
	return c.interval
}

func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	now := c.now()
	b, err := c.readFile("/proc/loadavg")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
	}

	load1, err := parseProcLoadavgFirst(string(b))
	if err != nil {
		// Unparseable content degrades this collector but is not a hard error.
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
	}

	cpus := c.cpuCount()
	if cpus <= 0 {
		cpus = 1
	}
	norm := load1 / float64(cpus)
	desired, window := desiredLoadPriority(norm)
	c.mu.Lock()
	// updateSustained (a helper elsewhere in this package) tracks how long the
	// desired priority has been continuously observed.
	c.pri, c.since = updateSustained(now, c.pri, c.since, desired)
	pri, since := c.pri, c.since
	c.mu.Unlock()

	// Only report once the condition has persisted for the full window.
	if pri == "" || since.IsZero() || now.Sub(since) < window {
		return nil, collectors.OKStatus(), nil
	}

	iss := model.Issue{
		ID:       "host:load:high",
		Category: model.CategoryPerformance,
		Priority: pri,
		Title:    "High sustained system load",
		Details:  "The 1-minute load average is high relative to CPU count for a sustained period.",
		Evidence: map[string]string{
			"load1":            fmt.Sprintf("%.2f", load1),
			"cpus":             strconv.Itoa(cpus),
			"load1_per_cpu":    fmt.Sprintf("%.2f", norm),
			"sustained_window": window.String(),
		},
		SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
	}
	return []model.Issue{iss}, collectors.OKStatus(), nil
}

func parseProcLoadavgFirst(content string) (float64, error) {
	// /proc/loadavg format: "1.23 0.70 0.50 1/123 4567".
	fields := strings.Fields(content)
	if len(fields) < 1 {
		return 0, fmt.Errorf("missing fields")
	}
	v, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, err
	}
	return v, nil
}

func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
	if loadPerCPU >= 6.0 {
		return model.PriorityP1, 120 * time.Second
	}
	if loadPerCPU >= 4.0 {
		return model.PriorityP2, 120 * time.Second
	}
	return "", 0
}

// Compile-time assertion that LoadCollector implements collectors.Collector.
var _ collectors.Collector = (*LoadCollector)(nil)
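Because now, readFile, and cpuCount are injectable, the sustained-window logic can be driven from a test without touching /proc or sleeping. A minimal sketch of such a test, assuming updateSustained keeps the original since timestamp while the desired priority is unchanged (its definition lives elsewhere in this package); the test name and assertions are illustrative, not the repository's actual test code:

package host

import (
	"context"
	"testing"
	"time"
)

// Sketch: fake the clock and /proc/loadavg to walk LoadCollector past the
// 120s sustained window. Assumes updateSustained preserves `since` while the
// desired priority stays the same.
func TestLoadCollectorSustainedSketch(t *testing.T) {
	clock := time.Unix(1000, 0)
	c := NewLoadCollector()
	c.now = func() time.Time { return clock }
	c.cpuCount = func() int { return 4 }
	// 16.0 / 4 CPUs = 4.0 per CPU: exactly the P2 threshold.
	c.readFile = func(string) ([]byte, error) {
		return []byte("16.00 12.00 8.00 1/123 4567"), nil
	}

	// First sample only starts the sustained timer; no issue yet.
	issues, _, err := c.Collect(context.Background())
	if err != nil || len(issues) != 0 {
		t.Fatalf("first sample: issues=%v err=%v", issues, err)
	}

	// The same reading after the 120s window should raise the P2 issue.
	clock = clock.Add(121 * time.Second)
	issues, _, err = c.Collect(context.Background())
	if err != nil || len(issues) != 1 {
		t.Fatalf("after window: issues=%v err=%v", issues, err)
	}
}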