package host

import (
	"context"
	"fmt"
	"os"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// LoadCollector evaluates the 1-minute load average normalized by logical CPU count.
//
// Thresholds (PLAN.md), normalized by CPU count:
//   - P2 if load1/cpus >= 4.0, sustained for 120s
//   - P1 if load1/cpus >= 6.0, sustained for 120s
//
// NOTE: Linux-specific (reads /proc/loadavg).
// Thread-safe: Collect() can be called concurrently.
type LoadCollector struct {
	interval time.Duration
	now      func() time.Time
	readFile func(string) ([]byte, error)
	cpuCount func() int

	mu    sync.Mutex // guards pri and since
	pri   model.Priority
	since time.Time
}

func NewLoadCollector() *LoadCollector {
	return &LoadCollector{
		interval: 5 * time.Second,
		now:      time.Now,
		readFile: os.ReadFile,
		cpuCount: runtime.NumCPU,
	}
}

func (c *LoadCollector) Name() string { return "host:load" }

func (c *LoadCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 5 * time.Second
	}
	return c.interval
}

func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}
	now := c.now()

	b, err := c.readFile("/proc/loadavg")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
	}
	load1, err := parseProcLoadavgFirst(string(b))
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
	}

	cpus := c.cpuCount()
	if cpus <= 0 {
		cpus = 1
	}
	norm := load1 / float64(cpus)

	desired, window := desiredLoadPriority(norm)

	c.mu.Lock()
	c.pri, c.since = updateSustained(now, c.pri, c.since, desired)
	pri, since := c.pri, c.since
	c.mu.Unlock()

	if pri == "" || since.IsZero() || now.Sub(since) < window {
		return nil, collectors.OKStatus(), nil
	}

	iss := model.Issue{
		ID:       "host:load:high",
		Category: model.CategoryPerformance,
		Priority: pri,
		Title:    "High sustained system load",
		Details:  "The 1-minute load average is high relative to CPU count for a sustained period.",
		Evidence: map[string]string{
			"load1":            fmt.Sprintf("%.2f", load1),
			"cpus":             strconv.Itoa(cpus),
			"load1_per_cpu":    fmt.Sprintf("%.2f", norm),
			"sustained_window": window.String(),
		},
		SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
	}
	return []model.Issue{iss}, collectors.OKStatus(), nil
}

func parseProcLoadavgFirst(content string) (float64, error) {
	// /proc/loadavg format: "1.23 0.70 0.50 1/123 4567".
	fields := strings.Fields(content)
	if len(fields) < 1 {
		return 0, fmt.Errorf("missing fields")
	}
	v, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, err
	}
	return v, nil
}

func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
	if loadPerCPU >= 6.0 {
		return model.PriorityP1, 120 * time.Second
	}
	if loadPerCPU >= 4.0 {
		return model.PriorityP2, 120 * time.Second
	}
	return "", 0
}

var _ collectors.Collector = (*LoadCollector)(nil)
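
// NOTE: updateSustained is called by Collect above but is not defined in this
// file; it presumably lives elsewhere in package host. The implementation
// below is a hypothetical sketch of the assumed semantics (drop it if the
// package already provides the real helper): track how long the desired
// priority has been continuously in effect, restarting the window whenever
// the desired priority changes. Escalation from P2 to P1 resetting the clock
// is an assumption of this sketch, not something the surrounding code pins down.
func updateSustained(now time.Time, cur model.Priority, since time.Time, desired model.Priority) (model.Priority, time.Time) {
	if desired == "" {
		// Condition cleared: forget any in-progress window.
		return "", time.Time{}
	}
	if desired != cur || since.IsZero() {
		// Condition newly entered (or priority changed): start a fresh window.
		return desired, now
	}
	// Same condition still holding: keep the original start time so the
	// caller can measure how long it has been sustained.
	return cur, since
}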
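
// A minimal sketch of driving Collect deterministically through the injected
// hooks (readFile, cpuCount, and now are fields of LoadCollector above; the
// loadavg line and the updateSustained semantics sketched above are assumptions):
//
//	c := NewLoadCollector()
//	c.readFile = func(string) ([]byte, error) {
//		return []byte("8.00 4.00 2.00 1/123 4567"), nil // load1 = 8.00
//	}
//	c.cpuCount = func() int { return 2 } // load1/cpus = 4.0 -> P2 candidate
//	base := time.Now()
//	c.now = func() time.Time { return base }
//	issues, _, _ := c.Collect(context.Background()) // starts the 120s window; issues == nil
//	c.now = func() time.Time { return base.Add(121 * time.Second) }
//	issues, _, _ = c.Collect(context.Background())
//	fmt.Println(len(issues)) // 1: the sustained P2 issue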