feat: implement ControlTower TUI for cluster and host monitoring
Add complete TUI application for monitoring Kubernetes clusters and host systems. Features include: Core features: - Collector framework with concurrent scheduling - Host collectors: disk, memory, load, network - Kubernetes collectors: pods, nodes, workloads, events with informers - Issue deduplication, state management, and resolve-after logic - Bubble Tea TUI with table view, details pane, and filtering - JSON export functionality UX improvements: - Help overlay with keybindings - Priority/category filters with visual indicators - Direct priority jump (0/1/2/3) - Bulk acknowledge (Shift+A) - Clipboard copy (y) - Theme toggle (T) - Age format toggle (d) - Wide title toggle (t) - Vi-style navigation (j/k) - Home/End jump (g/G) - Rollup drill-down in details Robustness: - Grace period for unreachable clusters - Rollups for high-volume issues - Flap suppression - RBAC error handling Files: All core application code with tests for host collectors, engine, store, model, and export packages.
This commit is contained in:
205
internal/collectors/host/mem.go
Normal file
205
internal/collectors/host/mem.go
Normal file
@@ -0,0 +1,205 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// MemCollector checks MemAvailable and swap pressure from /proc/meminfo.
|
||||
//
|
||||
// Thresholds (PLAN.md):
|
||||
// Memory (MemAvailable as % of MemTotal):
|
||||
// - P2 if <= 15% sustained 60s
|
||||
// - P1 if <= 10% sustained 60s
|
||||
// - P0 if <= 5% sustained 30s
|
||||
//
|
||||
// Swap pressure (only if RAM is also tight):
|
||||
// - P1 if swap used >= 50% AND MemAvailable <= 10% sustained 60s
|
||||
// - P0 if swap used >= 80% AND MemAvailable <= 5% sustained 30s
|
||||
//
|
||||
// Emits up to two issues:
|
||||
// - host:mem:available
|
||||
// - host:mem:swap
|
||||
//
|
||||
// NOTE: Linux-specific.
|
||||
// Thread-safe: Collect() can be called concurrently.
|
||||
type MemCollector struct {
|
||||
interval time.Duration
|
||||
|
||||
now func() time.Time
|
||||
readFile func(string) ([]byte, error)
|
||||
|
||||
mu sync.Mutex
|
||||
|
||||
memPri model.Priority
|
||||
memSince time.Time
|
||||
|
||||
swapPri model.Priority
|
||||
swapSince time.Time
|
||||
}
|
||||
|
||||
func NewMemCollector() *MemCollector {
|
||||
return &MemCollector{
|
||||
interval: 5 * time.Second,
|
||||
now: time.Now,
|
||||
readFile: os.ReadFile,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *MemCollector) Name() string { return "host:mem" }
|
||||
|
||||
func (c *MemCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 5 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *MemCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
now := c.now()
|
||||
b, err := c.readFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/meminfo"}, err
|
||||
}
|
||||
|
||||
mi := parseProcMeminfo(string(b))
|
||||
memTotalKB, okT := mi["MemTotal"]
|
||||
memAvailKB, okA := mi["MemAvailable"]
|
||||
if !okT || !okA || memTotalKB <= 0 {
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "missing MemTotal/MemAvailable"}, nil
|
||||
}
|
||||
|
||||
memAvailPct := (float64(memAvailKB) / float64(memTotalKB)) * 100.0
|
||||
|
||||
desiredMemPri, memWindow := desiredMemPriority(memAvailPct)
|
||||
c.mu.Lock()
|
||||
c.memPri, c.memSince = updateSustained(now, c.memPri, c.memSince, desiredMemPri)
|
||||
memPri, memSince := c.memPri, c.memSince
|
||||
c.mu.Unlock()
|
||||
|
||||
issues := make([]model.Issue, 0, 2)
|
||||
if memPri != "" && !memSince.IsZero() && now.Sub(memSince) >= memWindow {
|
||||
issues = append(issues, model.Issue{
|
||||
ID: "host:mem:available",
|
||||
Category: model.CategoryMemory,
|
||||
Priority: memPri,
|
||||
Title: "Low available memory",
|
||||
Details: "MemAvailable is low and has remained low for a sustained period.",
|
||||
Evidence: map[string]string{
|
||||
"mem_available_kb": strconv.FormatInt(memAvailKB, 10),
|
||||
"mem_total_kb": strconv.FormatInt(memTotalKB, 10),
|
||||
"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
|
||||
},
|
||||
SuggestedFix: "Identify memory hogs:\n free -h\n ps aux --sort=-rss | head\nConsider restarting runaway processes or adding RAM.",
|
||||
})
|
||||
}
|
||||
|
||||
swapTotalKB, okST := mi["SwapTotal"]
|
||||
swapFreeKB, okSF := mi["SwapFree"]
|
||||
swapUsedPct := 0.0
|
||||
if okST && okSF && swapTotalKB > 0 {
|
||||
swapUsedKB := swapTotalKB - swapFreeKB
|
||||
swapUsedPct = (float64(swapUsedKB) / float64(swapTotalKB)) * 100.0
|
||||
}
|
||||
|
||||
desiredSwapPri, swapWindow := desiredSwapPriority(memAvailPct, swapTotalKB, swapUsedPct)
|
||||
c.mu.Lock()
|
||||
c.swapPri, c.swapSince = updateSustained(now, c.swapPri, c.swapSince, desiredSwapPri)
|
||||
swapPri, swapSince := c.swapPri, c.swapSince
|
||||
c.mu.Unlock()
|
||||
if swapPri != "" && !swapSince.IsZero() && now.Sub(swapSince) >= swapWindow {
|
||||
issues = append(issues, model.Issue{
|
||||
ID: "host:mem:swap",
|
||||
Category: model.CategoryMemory,
|
||||
Priority: swapPri,
|
||||
Title: "High swap usage with low RAM",
|
||||
Details: "Swap usage is high while available RAM is also low, indicating memory pressure.",
|
||||
Evidence: map[string]string{
|
||||
"swap_used_pct": fmt.Sprintf("%.1f", swapUsedPct),
|
||||
"swap_total_kb": strconv.FormatInt(swapTotalKB, 10),
|
||||
"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
|
||||
},
|
||||
SuggestedFix: "Find swapping processes:\n vmstat 1\n smem -r 2>/dev/null || true\nConsider reducing memory usage or increasing RAM/swap.",
|
||||
})
|
||||
}
|
||||
|
||||
return issues, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
func parseProcMeminfo(content string) map[string]int64 {
|
||||
out := map[string]int64{}
|
||||
s := bufio.NewScanner(strings.NewReader(content))
|
||||
for s.Scan() {
|
||||
line := strings.TrimSpace(s.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
// Example: "MemAvailable: 12345 kB"
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSuffix(fields[0], ":")
|
||||
v, err := strconv.ParseInt(fields[1], 10, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out[key] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func desiredMemPriority(memAvailPct float64) (model.Priority, time.Duration) {
|
||||
switch {
|
||||
case memAvailPct <= 5.0:
|
||||
return model.PriorityP0, 30 * time.Second
|
||||
case memAvailPct <= 10.0:
|
||||
return model.PriorityP1, 60 * time.Second
|
||||
case memAvailPct <= 15.0:
|
||||
return model.PriorityP2, 60 * time.Second
|
||||
default:
|
||||
return "", 0
|
||||
}
|
||||
}
|
||||
|
||||
func desiredSwapPriority(memAvailPct float64, swapTotalKB int64, swapUsedPct float64) (model.Priority, time.Duration) {
|
||||
if swapTotalKB <= 0 {
|
||||
return "", 0
|
||||
}
|
||||
// Only alert on swap when RAM is also tight.
|
||||
switch {
|
||||
case swapUsedPct >= 80.0 && memAvailPct <= 5.0:
|
||||
return model.PriorityP0, 30 * time.Second
|
||||
case swapUsedPct >= 50.0 && memAvailPct <= 10.0:
|
||||
return model.PriorityP1, 60 * time.Second
|
||||
default:
|
||||
return "", 0
|
||||
}
|
||||
}
|
||||
|
||||
// updateSustained updates current severity and its since timestamp.
|
||||
// If desired is empty, it clears the state.
|
||||
func updateSustained(now time.Time, current model.Priority, since time.Time, desired model.Priority) (model.Priority, time.Time) {
|
||||
if desired == "" {
|
||||
return "", time.Time{}
|
||||
}
|
||||
if current != desired || since.IsZero() {
|
||||
return desired, now
|
||||
}
|
||||
return current, since
|
||||
}
|
||||
|
||||
var _ collectors.Collector = (*MemCollector)(nil)
|
||||
Reference in New Issue
Block a user