Files
porthole/internal/collectors/host/load.go
OpenCode Test 1421b4659e feat: implement ControlTower TUI for cluster and host monitoring
Add complete TUI application for monitoring Kubernetes clusters and host
systems. Features include:

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code with tests for host collectors,
engine, store, model, and export packages.
2025-12-24 13:29:51 -08:00

128 lines
3.2 KiB
Go

package host
import (
"context"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"tower/internal/collectors"
"tower/internal/model"
)
// LoadCollector evaluates 1-minute load average normalized by logical CPU count.
//
// Thresholds (PLAN.md), normalized by CPU count:
// - P2 if load1/cpus >= 4.0 sustained 120s
// - P1 if load1/cpus >= 6.0 sustained 120s
//
// NOTE: Linux-specific.
// Thread-safe: Collect() can be called concurrently.
type LoadCollector struct {
	interval time.Duration

	// Test seams: wall clock, /proc reader, and CPU counter are injectable
	// so Collect can be driven deterministically in unit tests.
	now      func() time.Time
	readFile func(string) ([]byte, error)
	cpuCount func() int

	// mu guards the sustained-threshold state below (pri, since).
	mu sync.Mutex
	// pri is the priority candidate currently being sustained;
	// "" when load is below every threshold.
	pri model.Priority
	// since is the timestamp paired with pri by updateSustained —
	// presumably when pri first became the candidate; confirm against
	// updateSustained's implementation. Zero when no candidate is active.
	since time.Time
}
// NewLoadCollector builds a collector wired to the real system: the wall
// clock, os.ReadFile for /proc access, and runtime.NumCPU for the CPU count.
func NewLoadCollector() *LoadCollector {
	c := &LoadCollector{interval: 5 * time.Second}
	c.now = time.Now
	c.readFile = os.ReadFile
	c.cpuCount = runtime.NumCPU
	return c
}
// Name returns the stable identifier this collector registers under.
func (c *LoadCollector) Name() string {
	return "host:load"
}
// Interval reports how often Collect should run, falling back to a
// 5-second default when the configured interval is zero or negative.
func (c *LoadCollector) Interval() time.Duration {
	if c.interval > 0 {
		return c.interval
	}
	return 5 * time.Second
}
// Collect samples /proc/loadavg, normalizes the 1-minute load average by the
// logical CPU count, and emits a single issue once a threshold has been
// sustained for its full window. Safe for concurrent use: the sustained-state
// transition is performed under c.mu.
func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}
	sampledAt := c.now()

	raw, err := c.readFile("/proc/loadavg")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
	}
	load1, err := parseProcLoadavgFirst(string(raw))
	if err != nil {
		// Unparseable content degrades the collector rather than failing it:
		// the collector itself is still healthy enough to keep sampling.
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
	}

	cpus := c.cpuCount()
	if cpus <= 0 {
		cpus = 1 // defensive: never divide by zero on a bogus CPU count
	}
	perCPU := load1 / float64(cpus)
	desired, window := desiredLoadPriority(perCPU)

	// Advance the sustained-threshold state machine under the lock and take a
	// consistent snapshot of the result.
	c.mu.Lock()
	c.pri, c.since = updateSustained(sampledAt, c.pri, c.since, desired)
	active, activeSince := c.pri, c.since
	c.mu.Unlock()

	// Only report once an active candidate has persisted for its full window.
	sustained := active != "" && !activeSince.IsZero() && sampledAt.Sub(activeSince) >= window
	if !sustained {
		return nil, collectors.OKStatus(), nil
	}

	return []model.Issue{{
		ID:       "host:load:high",
		Category: model.CategoryPerformance,
		Priority: active,
		Title:    "High sustained system load",
		Details:  "The 1-minute load average is high relative to CPU count for a sustained period.",
		Evidence: map[string]string{
			"load1":            fmt.Sprintf("%.2f", load1),
			"cpus":             strconv.Itoa(cpus),
			"load1_per_cpu":    fmt.Sprintf("%.2f", perCPU),
			"sustained_window": window.String(),
		},
		SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
	}}, collectors.OKStatus(), nil
}
// parseProcLoadavgFirst extracts the 1-minute load average from the raw
// contents of /proc/loadavg.
//
// Expected format: "1.23 0.70 0.50 1/123 4567"; only the first
// whitespace-separated field is consumed, so trailing fields (and a trailing
// newline) are tolerated.
func parseProcLoadavgFirst(content string) (float64, error) {
	fields := strings.Fields(content)
	if len(fields) < 1 {
		return 0, fmt.Errorf("missing fields")
	}
	v, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		// Wrap with the offending token so a malformed /proc/loadavg is
		// diagnosable from the error alone.
		return 0, fmt.Errorf("parsing load1 %q: %w", fields[0], err)
	}
	return v, nil
}
// desiredLoadPriority maps a per-CPU 1-minute load average to the priority it
// should raise and the window that priority must be sustained for. It returns
// the empty priority and a zero duration when the load is below every
// threshold.
func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
	const sustain = 120 * time.Second
	switch {
	case loadPerCPU >= 6.0:
		return model.PriorityP1, sustain
	case loadPerCPU >= 4.0:
		return model.PriorityP2, sustain
	default:
		return "", 0
	}
}
// Compile-time assertion that *LoadCollector satisfies collectors.Collector.
var _ collectors.Collector = (*LoadCollector)(nil)