feat: implement ControlTower TUI for cluster and host monitoring

Add complete TUI application for monitoring Kubernetes clusters and host
systems. Features include:

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: All core application code with tests for host collectors,
engine, store, model, and export packages.
This commit is contained in:
OpenCode Test
2025-12-24 13:03:08 -08:00
parent c2c03fd664
commit 1421b4659e
40 changed files with 5941 additions and 0 deletions
+45
View File
@@ -0,0 +1,45 @@
package collectors
import (
"context"
"time"
"tower/internal/model"
)
// Health is the coarse health level a collector reports alongside its issues.
type Health string

// Health levels a collector can report. The string values are what ends up in
// the JSON encoding of Status.
const (
	HealthOK       = Health("OK")
	HealthDegraded = Health("DEGRADED")
	HealthError    = Health("ERROR")
)
// Status describes collector health for the current tick.
//
// Collectors should return Status even when returning an error,
// so the UI can show useful context.
//
// LastSuccess should be the collector's most recent successful collect time.
// When unknown, it may be the zero value.
//
// Message should be short and human-friendly.
//
// NOTE(review): encoding/json's "omitempty" option never treats a struct value
// as empty, so a zero LastSuccess is still encoded (as year 1) rather than
// omitted. Confirm whether consumers of the JSON rely on that.
type Status struct {
	// Health is the overall level for this tick.
	Health Health `json:"health"`
	// Message is optional context; left out of the JSON when empty.
	Message string `json:"message,omitempty"`
	// LastSuccess is the time of the most recent successful collect, if known.
	LastSuccess time.Time `json:"last_success,omitempty"`
}
// OKStatus returns a Status whose Health is HealthOK and whose other fields
// are left at their zero values.
func OKStatus() Status {
	var st Status
	st.Health = HealthOK
	return st
}
// Collector returns "currently true" issues for this tick.
//
// The store is responsible for dedupe, lifecycle, and resolve-after.
// Collectors must respect ctx cancellation.
type Collector interface {
	// Name returns the collector's identifier (for example "host:disk").
	Name() string
	// Interval returns how often the collector wants Collect to be called.
	Interval() time.Duration
	// Collect gathers the issues that hold right now, together with a Status
	// describing the collector's own health. Implementations in this repository
	// return a meaningful Status even when the error is non-nil.
	Collect(ctx context.Context) ([]model.Issue, Status, error)
}
+287
View File
@@ -0,0 +1,287 @@
package host
import (
"bufio"
"context"
"fmt"
"os"
"strconv"
"strings"
"syscall"
"time"
"tower/internal/collectors"
"tower/internal/model"
)
// DiskCollector checks filesystem block + inode pressure across mounts.
//
// It reads /proc/mounts to discover mounts and then uses statfs to compute usage.
// Pseudo filesystems are filtered out.
//
// Thresholds (PLAN.md):
// - P1 if blocks OR inodes >= 92%
// - P0 if blocks OR inodes >= 98%
//
// Issues are emitted per mount (one issue that includes both block+inode usage).
//
// NOTE: This collector is Linux-specific.
type DiskCollector struct {
	// interval is the polling period; Interval falls back to 10s when it is <= 0.
	interval time.Duration
	// readFile reads /proc/mounts; NewDiskCollector wires it to os.ReadFile.
	readFile func(string) ([]byte, error)
	// statfs fills st for a mount point; NewDiskCollector wires it to syscall.Statfs.
	statfs func(path string, st *syscall.Statfs_t) error
}
// NewDiskCollector returns a DiskCollector that polls every 10 seconds and
// inspects the real system through os.ReadFile and syscall.Statfs.
func NewDiskCollector() *DiskCollector {
	c := new(DiskCollector)
	c.interval = 10 * time.Second
	c.readFile = os.ReadFile
	c.statfs = syscall.Statfs
	return c
}
// Name returns the collector identifier, "host:disk".
func (c *DiskCollector) Name() string {
	const name = "host:disk"
	return name
}
// Interval returns the configured polling period, or 10 seconds when the
// configured value is zero or negative.
func (c *DiskCollector) Interval() time.Duration {
	if d := c.interval; d > 0 {
		return d
	}
	return 10 * time.Second
}
// Collect scans the mounts listed in /proc/mounts and emits one issue for each
// mount whose block or inode usage reaches a threshold (see diskPriority).
//
// Mounts rejected by shouldSkipMount and repeated mount points are ignored.
// A mount whose statfs call fails is counted and skipped; any such failure
// turns the returned Status into HealthDegraded. An error is returned only
// when ctx is done or /proc/mounts cannot be read; if ctx is cancelled
// mid-scan the issues gathered so far are returned with the error.
func (c *DiskCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	canceled := collectors.Status{Health: collectors.HealthError, Message: "canceled"}
	if err := ctx.Err(); err != nil {
		return nil, canceled, err
	}

	raw, err := c.readFile("/proc/mounts")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/mounts"}, err
	}

	mounts := parseProcMounts(string(raw))
	if len(mounts) == 0 {
		// Unusual but treat as degraded rather than hard error.
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "no mounts found"}, nil
	}

	var (
		issues       = make([]model.Issue, 0, 8)
		visited      = make(map[string]struct{})
		statFailures int
	)
	for _, mnt := range mounts {
		if err := ctx.Err(); err != nil {
			return issues, canceled, err
		}
		if shouldSkipMount(mnt) {
			continue
		}

		// Only the first usable entry for a mount point is examined.
		path := mnt.MountPoint
		if _, dup := visited[path]; dup {
			continue
		}
		visited[path] = struct{}{}

		var fs syscall.Statfs_t
		if err := c.statfs(path, &fs); err != nil {
			statFailures++
			continue
		}

		blockPct, freeBytes := statfsBlockUsedPct(fs)
		inodePct := statfsInodeUsedPct(fs)
		priority, alert := diskPriority(blockPct, inodePct)
		if !alert {
			continue
		}

		evidence := map[string]string{
			"mount":            path,
			"fstype":           mnt.FSType,
			"block_used_pct":   fmt.Sprintf("%.1f", blockPct),
			"block_free_bytes": strconv.FormatUint(freeBytes, 10),
		}
		// A negative inode percentage means the filesystem reports no inodes.
		if inodePct >= 0 {
			evidence["inode_used_pct"] = fmt.Sprintf("%.1f", inodePct)
		}

		fix := fmt.Sprintf(
			"Inspect usage:\n df -h %s\n df -i %s\nFind large directories:\n sudo du -xh --max-depth=2 %s | sort -h | tail",
			path, path, path,
		)
		issues = append(issues, model.Issue{
			ID:           fmt.Sprintf("host:disk:%s:usage", path),
			Category:     model.CategoryStorage,
			Priority:     priority,
			Title:        fmt.Sprintf("Disk usage high on %s", path),
			Details:      "Filesystem space and/or inodes are nearly exhausted.",
			Evidence:     evidence,
			SuggestedFix: fix,
		})
	}

	status := collectors.OKStatus()
	if statFailures > 0 {
		status.Health = collectors.HealthDegraded
		status.Message = fmt.Sprintf("partial failures: %d mounts", statFailures)
	}
	return issues, status, nil
}
// procMount is one parsed line of /proc/mounts.
type procMount struct {
	// Device is the first field of the line, with octal escapes decoded.
	Device string
	// MountPoint is the second field of the line, with octal escapes decoded.
	MountPoint string
	// FSType is the third field, taken verbatim.
	FSType string
	// Options is the fourth field when present, taken verbatim.
	Options string
}
// parseProcMounts parses the text of /proc/mounts into one procMount per line.
//
// Lines with fewer than three whitespace-separated fields (including blank
// lines) are dropped. The device and mount point are unescaped; the filesystem
// type and, when present, the options are stored as written.
func parseProcMounts(content string) []procMount {
	mounts := make([]procMount, 0, 32)
	sc := bufio.NewScanner(strings.NewReader(content))
	for sc.Scan() {
		// Fields ignores surrounding whitespace, so blank lines yield no columns.
		cols := strings.Fields(sc.Text())
		if len(cols) < 3 {
			continue
		}
		entry := procMount{
			Device:     unescapeProcMountsField(cols[0]),
			MountPoint: unescapeProcMountsField(cols[1]),
			FSType:     cols[2],
		}
		if len(cols) > 3 {
			entry.Options = cols[3]
		}
		mounts = append(mounts, entry)
	}
	return mounts
}
// unescapeProcMountsField decodes the octal escapes used in /proc/mounts fields.
//
// Characters that would break the whitespace-separated format are written as
// a backslash followed by exactly three octal digits: space is \040, tab \011,
// newline \012 and backslash \134 (newer kernels also emit \043 for '#').
// Every such sequence whose value fits in a byte is decoded, not just a fixed
// list, and decoding is a single left-to-right pass so decoded output is never
// re-interpreted ("\134040" yields the literal text "\040").
//
// A backslash that is not followed by three octal digits is kept verbatim.
func unescapeProcMountsField(s string) string {
	// Fast path: most fields contain no escapes at all.
	if strings.IndexByte(s, '\\') < 0 {
		return s
	}

	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch == '\\' && i+3 < len(s) {
			// Byte subtraction wraps for characters below '0', which makes
			// them fail the range checks just like characters above '7'.
			d1, d2, d3 := s[i+1]-'0', s[i+2]-'0', s[i+3]-'0'
			if d1 <= 3 && d2 <= 7 && d3 <= 7 {
				b.WriteByte(d1<<6 | d2<<3 | d3)
				i += 3
				continue
			}
		}
		b.WriteByte(ch)
	}
	return b.String()
}
// pseudoFSTypes is the set of filesystem types that shouldSkipMount treats as
// not worth checking for disk pressure.
var pseudoFSTypes = func() map[string]struct{} {
	names := []string{
		"proc",
		"sysfs",
		"tmpfs",
		"devtmpfs",
		"devpts",
		"cgroup",
		"cgroup2",
		"pstore",
		"securityfs",
		"debugfs",
		"tracefs",
		"configfs",
		"hugetlbfs",
		"mqueue",
		"rpc_pipefs",
		"fusectl",
		"binfmt_misc",
		"autofs",
		"bpf",
		"ramfs",
		"nsfs",
		"efivarfs",
		"overlay",  // common container overlay mounts
		"squashfs", // typically read-only images
		"selinuxfs",
		"systemd-1",
		"overlayfs", // (non-standard) conservative skip
		"cgroupfs",
		"procfs",
		"fuse.lxcfs",
		"fuse.gvfsd-fuse",
	}
	set := make(map[string]struct{}, len(names))
	for _, name := range names {
		set[name] = struct{}{}
	}
	return set
}()
// shouldSkipMount reports whether a mount should be excluded from disk
// pressure checks.
//
// A mount is skipped when its mount point is empty, when its filesystem type
// is listed in pseudoFSTypes, or when it is /proc, /sys or /dev or lives below
// one of them. The directory test is done on path boundaries, so real
// filesystems whose mount point merely starts with the same characters
// (for example /sysroot, /devel or /processing) are still checked.
func shouldSkipMount(m procMount) bool {
	if m.MountPoint == "" {
		return true
	}
	// Filter by fstype.
	if _, ok := pseudoFSTypes[m.FSType]; ok {
		return true
	}
	// Filter common pseudo mount trees. /dev itself can be a real mount in
	// some cases, but usually isn't useful for disk pressure.
	for _, root := range [...]string{"/proc", "/sys", "/dev"} {
		if m.MountPoint == root || strings.HasPrefix(m.MountPoint, root+"/") {
			return true
		}
	}
	return false
}
// statfsBlockUsedPct returns the block usage percentage of a filesystem and
// the number of bytes still available to unprivileged users.
//
// It mirrors df(1) semantics closely:
//
//	total = f_blocks
//	used  = f_blocks - f_bfree
//	avail = f_bavail (space available to unprivileged user)
//	use%  = used / (used + avail)
//
// Both results are 0 when the filesystem reports no blocks or when
// used + avail is 0. If a filesystem reports more free blocks than total
// blocks, used is clamped to 0 instead of wrapping around in unsigned
// arithmetic, which would otherwise produce a bogus percentage.
func statfsBlockUsedPct(st syscall.Statfs_t) (usedPct float64, freeBytes uint64) {
	blocks := uint64(st.Blocks)
	if blocks == 0 {
		return 0, 0
	}
	bfree := uint64(st.Bfree)
	bavail := uint64(st.Bavail)

	// Guard the subtraction: uint64 underflow would turn an inconsistent
	// report into an enormous "used" count.
	var usedBlocks uint64
	if bfree < blocks {
		usedBlocks = blocks - bfree
	}

	denom := usedBlocks + bavail
	if denom == 0 {
		return 0, 0
	}
	freeBytes = bavail * uint64(st.Bsize)
	usedPct = (float64(usedBlocks) / float64(denom)) * 100.0
	return usedPct, freeBytes
}
// statfsInodeUsedPct computes the share of inodes in use, as a percentage of
// f_files. Filesystems that report no inodes at all (f_files == 0) yield -1.
func statfsInodeUsedPct(st syscall.Statfs_t) float64 {
	if st.Files == 0 {
		return -1
	}
	total, free := float64(st.Files), float64(st.Ffree)
	return ((total - free) / total) * 100.0
}
// diskPriority maps block and inode usage percentages to an issue priority.
//
// The larger of the two percentages decides; a negative inodePct (inode
// information unavailable) is ignored. It returns P0 at >= 98%, P1 at >= 92%,
// and ("", false) below that.
func diskPriority(blockPct, inodePct float64) (model.Priority, bool) {
	worst := blockPct
	if inodePct >= 0 && inodePct > worst {
		worst = inodePct
	}

	if worst >= 98.0 {
		return model.PriorityP0, true
	}
	if worst >= 92.0 {
		return model.PriorityP1, true
	}
	return "", false
}
// Compile-time check that *DiskCollector implements collectors.Collector.
var _ collectors.Collector = (*DiskCollector)(nil)
+80
View File
@@ -0,0 +1,80 @@
package host
import (
"syscall"
"testing"
)
// TestParseProcMounts_UnescapesAndParses checks that well-formed lines are
// parsed, \040 escapes in mount points are decoded, and short lines are dropped.
func TestParseProcMounts_UnescapesAndParses(t *testing.T) {
	const input = "dev1 / ext4 rw 0 0\n" +
		"dev2 /path\\040with\\040space xfs rw 0 0\n" +
		"badline\n"

	mounts := parseProcMounts(input)
	if got := len(mounts); got != 2 {
		t.Fatalf("expected 2 mounts, got %d", got)
	}

	root, spaced := mounts[0], mounts[1]
	if !(root.MountPoint == "/" && root.FSType == "ext4") {
		t.Fatalf("unexpected first mount: %+v", root)
	}
	if spaced.MountPoint != "/path with space" {
		t.Fatalf("expected unescaped mountpoint, got %q", spaced.MountPoint)
	}
}
// TestShouldSkipMount_FiltersPseudo checks that pseudo mounts are skipped and
// that an ordinary ext4 mount is not.
func TestShouldSkipMount_FiltersPseudo(t *testing.T) {
	pseudo := [...]procMount{
		{MountPoint: "/proc", FSType: "proc"},
		{MountPoint: "/sys", FSType: "sysfs"},
		{MountPoint: "/dev", FSType: "tmpfs"},
		{MountPoint: "/dev/shm", FSType: "tmpfs"},
	}
	for i := range pseudo {
		if m := pseudo[i]; !shouldSkipMount(m) {
			t.Fatalf("expected skip for %+v", m)
		}
	}

	regular := procMount{MountPoint: "/home", FSType: "ext4"}
	if shouldSkipMount(regular) {
		t.Fatalf("did not expect skip for /home ext4")
	}
}
// TestDiskPriority pins the threshold boundaries: nothing below 92%, P1 at
// 92%, and P0 as soon as either blocks or inodes reach 98%.
func TestDiskPriority(t *testing.T) {
	p, ok := diskPriority(91.9, -1)
	if ok {
		t.Fatalf("expected no issue, got %v", p)
	}

	p, ok = diskPriority(92.0, -1)
	if !(ok && p == "P1") {
		t.Fatalf("expected P1 at 92%%, got %v ok=%v", p, ok)
	}

	p, ok = diskPriority(97.9, 98.0)
	if !(ok && p == "P0") {
		t.Fatalf("expected P0 if either crosses 98%%, got %v ok=%v", p, ok)
	}
}
// TestStatfsCalculations exercises the block and inode percentage helpers on a
// hand-built Statfs_t, including the "no inode information" case.
func TestStatfsCalculations(t *testing.T) {
	var fs syscall.Statfs_t
	fs.Bsize = 1
	fs.Blocks = 100
	fs.Bfree = 8
	fs.Bavail = 8

	usedPct, freeBytes := statfsBlockUsedPct(fs)
	if freeBytes != 8 {
		t.Fatalf("expected free=8 bytes, got %d", freeBytes)
	}
	if !(usedPct >= 91.9 && usedPct <= 92.1) {
		t.Fatalf("expected ~92%% used, got %f", usedPct)
	}

	fs.Files = 100
	fs.Ffree = 2
	if inodePct := statfsInodeUsedPct(fs); !(inodePct >= 97.9 && inodePct <= 98.1) {
		t.Fatalf("expected ~98%% inode used, got %f", inodePct)
	}

	fs.Files = 0
	if got := statfsInodeUsedPct(fs); got != -1 {
		t.Fatalf("expected -1 when inode info unavailable")
	}
}
+127
View File
@@ -0,0 +1,127 @@
package host
import (
"context"
"fmt"
"os"
"runtime"
"strconv"
"strings"
"sync"
"time"
"tower/internal/collectors"
"tower/internal/model"
)
// LoadCollector evaluates 1-minute load average normalized by logical CPU count.
//
// Thresholds (PLAN.md), normalized by CPU count:
// - P2 if load1/cpus >= 4.0 sustained 120s
// - P1 if load1/cpus >= 6.0 sustained 120s
//
// NOTE: Linux-specific.
// Thread-safe: Collect() can be called concurrently.
type LoadCollector struct {
	// interval is the polling period; Interval falls back to 5s when it is <= 0.
	interval time.Duration
	// now supplies the current time; NewLoadCollector wires it to time.Now.
	now func() time.Time
	// readFile reads /proc/loadavg; NewLoadCollector wires it to os.ReadFile.
	readFile func(string) ([]byte, error)
	// cpuCount returns the CPU count used for normalization; NewLoadCollector
	// wires it to runtime.NumCPU.
	cpuCount func() int
	// mu guards pri and since.
	mu sync.Mutex
	// pri is the priority currently being tracked ("" when load is below all thresholds).
	pri model.Priority
	// since is when pri was first observed; an issue is emitted once it has
	// held for the sustain window.
	since time.Time
}
// NewLoadCollector returns a LoadCollector that polls every 5 seconds and uses
// time.Now, os.ReadFile and runtime.NumCPU as its data sources.
func NewLoadCollector() *LoadCollector {
	c := new(LoadCollector)
	c.interval = 5 * time.Second
	c.now = time.Now
	c.readFile = os.ReadFile
	c.cpuCount = runtime.NumCPU
	return c
}
// Name returns the collector identifier, "host:load".
func (c *LoadCollector) Name() string {
	const name = "host:load"
	return name
}
// Interval returns the configured polling period, or 5 seconds when the
// configured value is zero or negative.
func (c *LoadCollector) Interval() time.Duration {
	if d := c.interval; d > 0 {
		return d
	}
	return 5 * time.Second
}
// Collect reads /proc/loadavg, normalizes the 1-minute load by CPU count and
// emits a single "host:load:high" issue once a threshold has been exceeded
// continuously for the sustain window.
//
// An error is returned only when ctx is done or /proc/loadavg cannot be read.
// Unparseable content yields a HealthDegraded status with a nil error. A CPU
// count below 1 is treated as 1.
func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	ts := c.now()
	raw, err := c.readFile("/proc/loadavg")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
	}

	load1, err := parseProcLoadavgFirst(string(raw))
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
	}

	ncpu := c.cpuCount()
	if ncpu < 1 {
		ncpu = 1
	}
	perCPU := load1 / float64(ncpu)
	want, window := desiredLoadPriority(perCPU)

	// Update the tracked priority and take a snapshot under the lock.
	c.mu.Lock()
	c.pri, c.since = updateSustained(ts, c.pri, c.since, want)
	level, start := c.pri, c.since
	c.mu.Unlock()

	sustained := level != "" && !start.IsZero() && ts.Sub(start) >= window
	if !sustained {
		return nil, collectors.OKStatus(), nil
	}

	evidence := map[string]string{
		"load1":            fmt.Sprintf("%.2f", load1),
		"cpus":             strconv.Itoa(ncpu),
		"load1_per_cpu":    fmt.Sprintf("%.2f", perCPU),
		"sustained_window": window.String(),
	}
	issue := model.Issue{
		ID:           "host:load:high",
		Category:     model.CategoryPerformance,
		Priority:     level,
		Title:        "High sustained system load",
		Details:      "The 1-minute load average is high relative to CPU count for a sustained period.",
		Evidence:     evidence,
		SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
	}
	return []model.Issue{issue}, collectors.OKStatus(), nil
}
// parseProcLoadavgFirst extracts the 1-minute load average, i.e. the first
// whitespace-separated field of /proc/loadavg content such as
// "1.23 0.70 0.50 1/123 4567".
//
// It returns an error when the content has no fields or the first field is
// not a valid float; the value is 0 in both cases.
func parseProcLoadavgFirst(content string) (float64, error) {
	tokens := strings.Fields(content)
	if len(tokens) == 0 {
		return 0, fmt.Errorf("missing fields")
	}

	load1, parseErr := strconv.ParseFloat(tokens[0], 64)
	if parseErr != nil {
		return 0, parseErr
	}
	return load1, nil
}
// desiredLoadPriority maps a per-CPU load value to the priority it warrants
// and the time it must be sustained: P1 at >= 6.0 and P2 at >= 4.0, both with
// a 120s window. Below 4.0 it returns ("", 0).
func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
	switch {
	case loadPerCPU >= 6.0:
		return model.PriorityP1, 120 * time.Second
	case loadPerCPU >= 4.0:
		return model.PriorityP2, 120 * time.Second
	default:
		return "", 0
	}
}
// Compile-time check that *LoadCollector implements collectors.Collector.
var _ collectors.Collector = (*LoadCollector)(nil)
+48
View File
@@ -0,0 +1,48 @@
package host
import (
"testing"
"time"
"tower/internal/model"
)
// TestParseProcLoadavgFirst checks that the first field of a typical
// /proc/loadavg line is parsed and that empty content is rejected.
func TestParseProcLoadavgFirst(t *testing.T) {
	load1, err := parseProcLoadavgFirst("1.23 0.70 0.50 1/123 4567\n")
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if !(load1 >= 1.229 && load1 <= 1.231) {
		t.Fatalf("expected 1.23, got %v", load1)
	}

	_, err = parseProcLoadavgFirst("\n")
	if err == nil {
		t.Fatalf("expected error")
	}
}
// TestDesiredLoadPriority pins the threshold boundaries at 4.0 (P2) and
// 6.0 (P1), both with a 120s sustain window.
func TestDesiredLoadPriority(t *testing.T) {
	const window = 120 * time.Second

	pri, w := desiredLoadPriority(3.99)
	if !(pri == "" && w == 0) {
		t.Fatalf("expected none")
	}

	pri, w = desiredLoadPriority(4.0)
	if !(pri == model.PriorityP2 && w == window) {
		t.Fatalf("expected P2/120s")
	}

	pri, w = desiredLoadPriority(6.0)
	if !(pri == model.PriorityP1 && w == window) {
		t.Fatalf("expected P1/120s")
	}
}
// TestUpdateSustainedWorksForLoadToo checks that a newly desired priority
// starts tracking at the current time and that an unchanged priority keeps
// its original start time.
func TestUpdateSustainedWorksForLoadToo(t *testing.T) {
	start := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)

	pri, since := updateSustained(start, "", time.Time{}, model.PriorityP2)
	if !(pri == model.PriorityP2 && since.Equal(start)) {
		t.Fatalf("expected set")
	}

	later := start.Add(10 * time.Second)
	nextPri, nextSince := updateSustained(later, pri, since, model.PriorityP2)
	if !(nextPri == model.PriorityP2 && nextSince.Equal(since)) {
		t.Fatalf("expected unchanged")
	}
}
+205
View File
@@ -0,0 +1,205 @@
package host
import (
"bufio"
"context"
"fmt"
"os"
"strconv"
"strings"
"sync"
"time"
"tower/internal/collectors"
"tower/internal/model"
)
// MemCollector checks MemAvailable and swap pressure from /proc/meminfo.
//
// Thresholds (PLAN.md):
// Memory (MemAvailable as % of MemTotal):
// - P2 if <= 15% sustained 60s
// - P1 if <= 10% sustained 60s
// - P0 if <= 5% sustained 30s
//
// Swap pressure (only if RAM is also tight):
// - P1 if swap used >= 50% AND MemAvailable <= 10% sustained 60s
// - P0 if swap used >= 80% AND MemAvailable <= 5% sustained 30s
//
// Emits up to two issues:
// - host:mem:available
// - host:mem:swap
//
// NOTE: Linux-specific.
// Thread-safe: Collect() can be called concurrently.
type MemCollector struct {
	// interval is the polling period; Interval falls back to 5s when it is <= 0.
	interval time.Duration
	// now supplies the current time; NewMemCollector wires it to time.Now.
	now func() time.Time
	// readFile reads /proc/meminfo; NewMemCollector wires it to os.ReadFile.
	readFile func(string) ([]byte, error)
	// mu guards the four tracking fields below.
	mu sync.Mutex
	// memPri and memSince track the current MemAvailable priority and when it
	// was first observed.
	memPri model.Priority
	memSince time.Time
	// swapPri and swapSince track the current swap-pressure priority and when
	// it was first observed.
	swapPri model.Priority
	swapSince time.Time
}
// NewMemCollector returns a MemCollector that polls every 5 seconds and uses
// time.Now and os.ReadFile as its data sources.
func NewMemCollector() *MemCollector {
	c := new(MemCollector)
	c.interval = 5 * time.Second
	c.now = time.Now
	c.readFile = os.ReadFile
	return c
}
// Name returns the collector identifier, "host:mem".
func (c *MemCollector) Name() string {
	const name = "host:mem"
	return name
}
// Interval returns the configured polling period, or 5 seconds when the
// configured value is zero or negative.
func (c *MemCollector) Interval() time.Duration {
	if d := c.interval; d > 0 {
		return d
	}
	return 5 * time.Second
}
// Collect reads /proc/meminfo and emits up to two issues: "host:mem:available"
// when MemAvailable has stayed at or below a threshold for its sustain window,
// and "host:mem:swap" when swap usage is high while RAM is also tight.
//
// An error is returned only when ctx is done or /proc/meminfo cannot be read.
// Missing MemTotal/MemAvailable (or a non-positive MemTotal) yields a
// HealthDegraded status with a nil error.
func (c *MemCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	ts := c.now()
	raw, err := c.readFile("/proc/meminfo")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/meminfo"}, err
	}

	info := parseProcMeminfo(string(raw))
	totalKB, haveTotal := info["MemTotal"]
	availKB, haveAvail := info["MemAvailable"]
	if !(haveTotal && haveAvail) || totalKB <= 0 {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "missing MemTotal/MemAvailable"}, nil
	}
	availPct := (float64(availKB) / float64(totalKB)) * 100.0

	// --- Available memory ---
	wantMem, memWindow := desiredMemPriority(availPct)
	c.mu.Lock()
	c.memPri, c.memSince = updateSustained(ts, c.memPri, c.memSince, wantMem)
	memLevel, memStart := c.memPri, c.memSince
	c.mu.Unlock()

	found := make([]model.Issue, 0, 2)
	if memLevel != "" && !memStart.IsZero() && ts.Sub(memStart) >= memWindow {
		memIssue := model.Issue{
			ID:       "host:mem:available",
			Category: model.CategoryMemory,
			Priority: memLevel,
			Title:    "Low available memory",
			Details:  "MemAvailable is low and has remained low for a sustained period.",
			Evidence: map[string]string{
				"mem_available_kb":  strconv.FormatInt(availKB, 10),
				"mem_total_kb":      strconv.FormatInt(totalKB, 10),
				"mem_available_pct": fmt.Sprintf("%.1f", availPct),
			},
			SuggestedFix: "Identify memory hogs:\n free -h\n ps aux --sort=-rss | head\nConsider restarting runaway processes or adding RAM.",
		}
		found = append(found, memIssue)
	}

	// --- Swap pressure ---
	swapTotalKB, haveSwapTotal := info["SwapTotal"]
	swapFreeKB, haveSwapFree := info["SwapFree"]
	var swapUsedPct float64
	if haveSwapTotal && haveSwapFree && swapTotalKB > 0 {
		usedKB := swapTotalKB - swapFreeKB
		swapUsedPct = (float64(usedKB) / float64(swapTotalKB)) * 100.0
	}

	wantSwap, swapWindow := desiredSwapPriority(availPct, swapTotalKB, swapUsedPct)
	c.mu.Lock()
	c.swapPri, c.swapSince = updateSustained(ts, c.swapPri, c.swapSince, wantSwap)
	swapLevel, swapStart := c.swapPri, c.swapSince
	c.mu.Unlock()

	if swapLevel != "" && !swapStart.IsZero() && ts.Sub(swapStart) >= swapWindow {
		swapIssue := model.Issue{
			ID:       "host:mem:swap",
			Category: model.CategoryMemory,
			Priority: swapLevel,
			Title:    "High swap usage with low RAM",
			Details:  "Swap usage is high while available RAM is also low, indicating memory pressure.",
			Evidence: map[string]string{
				"swap_used_pct":     fmt.Sprintf("%.1f", swapUsedPct),
				"swap_total_kb":     strconv.FormatInt(swapTotalKB, 10),
				"mem_available_pct": fmt.Sprintf("%.1f", availPct),
			},
			SuggestedFix: "Find swapping processes:\n vmstat 1\n smem -r 2>/dev/null || true\nConsider reducing memory usage or increasing RAM/swap.",
		}
		found = append(found, swapIssue)
	}

	return found, collectors.OKStatus(), nil
}
// parseProcMeminfo parses /proc/meminfo content into a map from key (without
// the trailing colon) to the integer that follows it, e.g.
// "MemAvailable: 12345 kB" becomes {"MemAvailable": 12345}.
//
// Lines with fewer than two fields or a non-integer value are ignored.
func parseProcMeminfo(content string) map[string]int64 {
	values := make(map[string]int64)
	sc := bufio.NewScanner(strings.NewReader(content))
	for sc.Scan() {
		// Fields ignores surrounding whitespace, so blank lines yield no columns.
		cols := strings.Fields(sc.Text())
		if len(cols) < 2 {
			continue
		}
		if n, err := strconv.ParseInt(cols[1], 10, 64); err == nil {
			values[strings.TrimSuffix(cols[0], ":")] = n
		}
	}
	return values
}
// desiredMemPriority maps the MemAvailable percentage to the priority it
// warrants and the time it must be sustained: P0 at <= 5% (30s), P1 at <= 10%
// (60s), P2 at <= 15% (60s). Above 15% it returns ("", 0).
func desiredMemPriority(memAvailPct float64) (model.Priority, time.Duration) {
	if memAvailPct <= 5.0 {
		return model.PriorityP0, 30 * time.Second
	}
	if memAvailPct <= 10.0 {
		return model.PriorityP1, 60 * time.Second
	}
	if memAvailPct <= 15.0 {
		return model.PriorityP2, 60 * time.Second
	}
	return "", 0
}
// desiredSwapPriority maps swap usage combined with RAM tightness to a
// priority and sustain window.
//
// Swap only counts when RAM is also tight: P0 when swap used >= 80% and
// MemAvailable <= 5% (30s); P1 when swap used >= 50% and MemAvailable <= 10%
// (60s). With no swap configured (swapTotalKB <= 0), or otherwise, it returns
// ("", 0).
func desiredSwapPriority(memAvailPct float64, swapTotalKB int64, swapUsedPct float64) (model.Priority, time.Duration) {
	if swapTotalKB <= 0 {
		return "", 0
	}
	if swapUsedPct >= 80.0 && memAvailPct <= 5.0 {
		return model.PriorityP0, 30 * time.Second
	}
	if swapUsedPct >= 50.0 && memAvailPct <= 10.0 {
		return model.PriorityP1, 60 * time.Second
	}
	return "", 0
}
// updateSustained advances the (priority, since) tracking state by one tick.
//
// An empty desired priority clears the state. A desired priority that differs
// from the current one, or a state with no start time yet, restarts tracking
// at now. Otherwise the existing state is returned unchanged.
func updateSustained(now time.Time, current model.Priority, since time.Time, desired model.Priority) (model.Priority, time.Time) {
	switch {
	case desired == "":
		return "", time.Time{}
	case desired == current && !since.IsZero():
		return current, since
	default:
		return desired, now
	}
}
// Compile-time check that *MemCollector implements collectors.Collector.
var _ collectors.Collector = (*MemCollector)(nil)
+83
View File
@@ -0,0 +1,83 @@
package host
import (
"testing"
"time"
"tower/internal/model"
)
// TestParseProcMeminfo checks that MemTotal and MemAvailable are extracted
// from typical /proc/meminfo content.
func TestParseProcMeminfo(t *testing.T) {
	const input = "MemTotal: 8000000 kB\nMemAvailable: 800000 kB\nSwapTotal: 2000000 kB\nSwapFree: 500000 kB\n"

	parsed := parseProcMeminfo(input)
	if got := parsed["MemTotal"]; got != 8000000 {
		t.Fatalf("MemTotal mismatch: %d", got)
	}
	if got := parsed["MemAvailable"]; got != 800000 {
		t.Fatalf("MemAvailable mismatch: %d", got)
	}
}
// TestDesiredMemPriority pins the threshold boundaries at 15% (P2/60s),
// 10% (P1/60s) and 5% (P0/30s), and checks that nothing is reported above 15%.
// Every case asserts both the priority and the sustain window.
func TestDesiredMemPriority(t *testing.T) {
	p, w := desiredMemPriority(16.0)
	if p != "" || w != 0 {
		t.Fatalf("expected none")
	}
	p, w = desiredMemPriority(15.0)
	if p != model.PriorityP2 || w != 60*time.Second {
		t.Fatalf("expected P2/60s got %v/%v", p, w)
	}
	// The window was previously left unchecked for the P1 case.
	p, w = desiredMemPriority(10.0)
	if p != model.PriorityP1 || w != 60*time.Second {
		t.Fatalf("expected P1/60s got %v/%v", p, w)
	}
	p, w = desiredMemPriority(5.0)
	if p != model.PriorityP0 || w != 30*time.Second {
		t.Fatalf("expected P0/30s got %v/%v", p, w)
	}
}
// TestDesiredSwapPriority covers the no-swap case, both alerting tiers, and
// the rule that high swap usage alone does not alert while RAM is plentiful.
func TestDesiredSwapPriority(t *testing.T) {
	// No swap configured.
	if pri, _ := desiredSwapPriority(4.0, 0, 90.0); pri != "" {
		t.Fatalf("expected none when SwapTotal=0")
	}

	pri, window := desiredSwapPriority(4.0, 1000, 80.0)
	if !(pri == model.PriorityP0 && window == 30*time.Second) {
		t.Fatalf("expected P0/30s got %v/%v", pri, window)
	}

	pri, window = desiredSwapPriority(9.9, 1000, 50.0)
	if !(pri == model.PriorityP1 && window == 60*time.Second) {
		t.Fatalf("expected P1/60s got %v/%v", pri, window)
	}

	// Swap high but RAM not tight => no issue.
	if pri, _ := desiredSwapPriority(20.0, 1000, 90.0); pri != "" {
		t.Fatalf("expected none when RAM not tight")
	}
}
// TestUpdateSustained walks the tracking state through its transitions:
// initial set, unchanged priority, priority change (restart) and clearing.
func TestUpdateSustained(t *testing.T) {
	t0 := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	t1, t2, t3 := t0.Add(1*time.Second), t0.Add(2*time.Second), t0.Add(3*time.Second)

	pri, since := updateSustained(t0, "", time.Time{}, model.PriorityP1)
	if !(pri == model.PriorityP1 && since.Equal(t0)) {
		t.Fatalf("expected set to P1 at now")
	}

	samePri, sameSince := updateSustained(t1, pri, since, model.PriorityP1)
	if !(samePri == model.PriorityP1 && sameSince.Equal(since)) {
		t.Fatalf("expected unchanged since")
	}

	newPri, newSince := updateSustained(t2, samePri, sameSince, model.PriorityP0)
	if !(newPri == model.PriorityP0 && newSince.Equal(t2)) {
		t.Fatalf("expected reset on priority change")
	}

	clearedPri, clearedSince := updateSustained(t3, newPri, newSince, "")
	if !(clearedPri == "" && clearedSince.IsZero()) {
		t.Fatalf("expected cleared")
	}
}
+138
View File
@@ -0,0 +1,138 @@
package host
import (
"bufio"
"context"
"os"
"path/filepath"
"strings"
"time"
"tower/internal/collectors"
"tower/internal/model"
)
// NetCollector checks for missing default route while at least one non-loopback
// interface is up.
//
// Rule (PLAN.md):
// - P1 if no default route AND any non-loopback interface is UP.
//
// Discovery:
// - Default route from /proc/net/route
// - Interface UP from /sys/class/net/*/operstate
//
// NOTE: Linux-specific.
type NetCollector struct {
	// interval is the polling period; Interval falls back to 5s when it is <= 0.
	interval time.Duration
	// readFile reads /proc/net/route and each operstate file; NewNetCollector
	// wires it to os.ReadFile.
	readFile func(string) ([]byte, error)
	// glob expands the operstate pattern; NewNetCollector wires it to filepath.Glob.
	glob func(string) ([]string, error)
}
// NewNetCollector returns a NetCollector that polls every 5 seconds and uses
// os.ReadFile and filepath.Glob to inspect the system.
func NewNetCollector() *NetCollector {
	c := new(NetCollector)
	c.interval = 5 * time.Second
	c.readFile = os.ReadFile
	c.glob = filepath.Glob
	return c
}
// Name returns the collector identifier, "host:net".
func (c *NetCollector) Name() string {
	const name = "host:net"
	return name
}
// Interval returns the configured polling period, or 5 seconds when the
// configured value is zero or negative.
func (c *NetCollector) Interval() time.Duration {
	if d := c.interval; d > 0 {
		return d
	}
	return 5 * time.Second
}
// Collect reports a P1 "host:net:default-route-missing" issue when
// /proc/net/route contains no default route while at least one non-loopback
// interface under /sys/class/net is up.
//
// An error is returned when ctx is done, /proc/net/route cannot be read, or
// the operstate glob fails. Operstate files that cannot be read are silently
// skipped (best effort).
func (c *NetCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	routes, err := c.readFile("/proc/net/route")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/net/route"}, err
	}
	defaultRoute := hasDefaultRoute(string(routes))

	statePaths, err := c.glob("/sys/class/net/*/operstate")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed listing /sys/class/net"}, err
	}

	up := make([]string, 0, 2)
	for _, statePath := range statePaths {
		if err := ctx.Err(); err != nil {
			return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
		}
		raw, err := c.readFile(statePath)
		if err != nil {
			continue
		}
		// The interface name is the directory that contains the operstate file.
		name := filepath.Base(filepath.Dir(statePath))
		if name == "lo" || !isIfaceUp(strings.TrimSpace(string(raw))) {
			continue
		}
		up = append(up, name)
	}

	if defaultRoute || len(up) == 0 {
		return nil, collectors.OKStatus(), nil
	}

	issue := model.Issue{
		ID:       "host:net:default-route-missing",
		Category: model.CategoryNetwork,
		Priority: model.PriorityP1,
		Title:    "No default route",
		Details:  "At least one network interface is up, but no default route is present.",
		Evidence: map[string]string{
			"up_ifaces": strings.Join(up, ","),
		},
		SuggestedFix: "Check routing and link state:\n ip route\n ip link\n nmcli dev status\nIf on Wi-Fi, reconnect; if on VPN, verify tunnel routes.",
	}
	return []model.Issue{issue}, collectors.OKStatus(), nil
}
// hasDefaultRoute reports whether /proc/net/route content lists a default
// route, i.e. a row whose Destination column (the second field) is 00000000.
//
// The table header is
//
//	Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT
//
// and is skipped when it is the first non-blank line. Blank lines and rows
// with fewer than two fields are ignored.
func hasDefaultRoute(procNetRoute string) bool {
	sc := bufio.NewScanner(strings.NewReader(procNetRoute))
	headerPending := true
	for sc.Scan() {
		row := strings.TrimSpace(sc.Text())
		if row == "" {
			continue
		}
		if headerPending {
			// Only the first non-blank line can be the header.
			headerPending = false
			if strings.HasPrefix(row, "Iface") {
				continue
			}
		}
		if cols := strings.Fields(row); len(cols) >= 2 && cols[1] == "00000000" {
			return true
		}
	}
	return false
}
// isIfaceUp reports whether an operstate value counts as "up". Linux
// operstate values include up, down, unknown, dormant and lowerlayerdown;
// both "up" and "unknown" are treated as up. The comparison ignores case and
// surrounding whitespace.
func isIfaceUp(operstate string) bool {
	switch strings.ToLower(strings.TrimSpace(operstate)) {
	case "up", "unknown":
		return true
	default:
		return false
	}
}
// Compile-time check that *NetCollector implements collectors.Collector.
var _ collectors.Collector = (*NetCollector)(nil)
+28
View File
@@ -0,0 +1,28 @@
package host
import "testing"
// TestHasDefaultRoute checks detection of a 00000000 destination and that a
// 00000000 gateway alone is not mistaken for a default route.
func TestHasDefaultRoute(t *testing.T) {
	const withDefault = "Iface\tDestination\tGateway\tFlags\n" +
		"eth0\t00000000\t0102A8C0\t0003\n"
	if found := hasDefaultRoute(withDefault); !found {
		t.Fatalf("expected default route")
	}

	const withoutDefault = "Iface Destination Gateway Flags\n" +
		"eth0 0010A8C0 00000000 0001\n"
	if found := hasDefaultRoute(withoutDefault); found {
		t.Fatalf("expected no default route")
	}
}
// TestIsIfaceUp checks that "up" (with trailing newline) and "unknown" count
// as up while "down" does not.
func TestIsIfaceUp(t *testing.T) {
	if up := isIfaceUp("up\n"); !up {
		t.Fatalf("expected true")
	}
	if up := isIfaceUp("unknown"); !up {
		t.Fatalf("expected true for unknown")
	}
	if up := isIfaceUp("down"); up {
		t.Fatalf("expected false")
	}
}
+88
View File
@@ -0,0 +1,88 @@
package k8s
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"time"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
)
// ClientFromCurrentContext builds a client-go Clientset, together with the
// rest.Config it was created from, for the current context of the user's
// kubeconfig.
//
// It keeps no global state, so collectors and unit tests (using temporary
// kubeconfig files) can call it freely.
//
// KUBECONFIG is honoured: a multi-entry path list is installed as the loading
// precedence, a single path is used as the explicit path. A rest.Config with
// no timeout gets a 30 second one.
func ClientFromCurrentContext() (*kubernetes.Clientset, *rest.Config, error) {
	rules := clientcmd.NewDefaultClientConfigLoadingRules()

	// Respect KUBECONFIG semantics (it may be a path list).
	if env := os.Getenv("KUBECONFIG"); env != "" {
		paths := filepath.SplitList(env)
		if len(paths) > 1 {
			rules.ExplicitPath = ""
			rules.Precedence = paths
		} else {
			rules.ExplicitPath = env
		}
	}

	overrides := &clientcmd.ConfigOverrides{}
	restCfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(rules, overrides).ClientConfig()
	if err != nil {
		return nil, nil, err
	}

	// Ensure HTTP client timeouts are bounded. LIST fallback uses its own
	// context timeouts, but this provides a safety net.
	if restCfg.Timeout <= 0 {
		restCfg.Timeout = 30 * time.Second
	}

	clientset, err := kubernetes.NewForConfig(restCfg)
	if err != nil {
		return nil, nil, err
	}
	return clientset, restCfg, nil
}
// defaultKubeconfigPath returns the kubeconfig path to use for existence
// checks and UI messages. Client loading should use client-go's default
// loading rules instead.
//
// When KUBECONFIG is set, the first non-empty entry of the path list is
// returned. Empty entries (as produced by a leading separator, e.g.
// KUBECONFIG=":/path/config") are skipped, matching client-go, which ignores
// empty file names. If KUBECONFIG contains only empty entries the result
// is "".
//
// When KUBECONFIG is unset or empty, the result is $HOME/.kube/config, or ""
// if the home directory cannot be determined.
func defaultKubeconfigPath() string {
	if env := os.Getenv("KUBECONFIG"); env != "" {
		for _, entry := range filepath.SplitList(env) {
			if entry != "" {
				return entry
			}
		}
		return ""
	}

	home, err := os.UserHomeDir()
	if err != nil {
		return ""
	}
	return filepath.Join(home, ".kube", "config")
}
// Ping performs a lightweight API call (discovery ServerVersion) to determine
// if the cluster is reachable and authentication works.
//
// It returns an error when cs is nil, when ctx is already cancelled or past
// its deadline, or when the discovery call fails. Forbidden/unauthorized
// failures are wrapped with a "discovery auth" prefix so callers can tell
// "insufficient credentials" apart from "unreachable"; every wrapped error
// remains available through errors.Is / errors.As.
//
// The ServerVersion call used here takes no context, so ctx is only checked
// before the request is issued; a request already in flight is not
// interrupted by ctx.
func Ping(ctx context.Context, cs kubernetes.Interface) error {
	if cs == nil {
		return errors.New("nil kubernetes client")
	}
	// Respect cancellation: don't start a network round trip for a caller
	// that has already given up. A nil ctx is tolerated for compatibility.
	if ctx != nil {
		if err := ctx.Err(); err != nil {
			return fmt.Errorf("discovery server version: %w", err)
		}
	}

	if _, err := cs.Discovery().ServerVersion(); err != nil {
		// Treat authn/authz errors separately so callers can decide whether to
		// surface "unreachable" vs "insufficient credentials".
		if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) {
			return fmt.Errorf("discovery auth: %w", err)
		}
		return fmt.Errorf("discovery server version: %w", err)
	}
	return nil
}
+720
View File
@@ -0,0 +1,720 @@
package k8s
import (
"context"
"fmt"
"os"
"path/filepath"
"sort"
"sync"
"time"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes"
appslisters "k8s.io/client-go/listers/apps/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"tower/internal/collectors"
"tower/internal/model"
)
// Collector is the ControlTower Kubernetes collector.
//
// It uses client-go informers (LIST+WATCH with local caches) against the user's
// kubeconfig current context, across all namespaces.
//
// Degradation behavior:
// - If WATCH fails repeatedly, it falls back to polling LIST and emits a P1
// "degraded to polling" issue.
// - While in polling mode, it periodically attempts to recover back to watches.
// - If the cluster is unreachable, it emits a P0 only after 10s continuous failure.
// - If RBAC forbids list/watch for a resource, it emits a single P2 issue per
// inaccessible resource and continues for accessible resources.
//
// Noise control:
// - Rollups group by (namespace, reason, kind) when group size >= 20.
// - Cap max issues to 200 after rollups.
//
// Instantiate with NewCollector().
type Collector struct {
	// interval is the polling period; Interval falls back to 2s when it is <= 0.
	interval time.Duration
	// unreachableGrace is passed to newUnreachableTracker by NewCollector
	// (10s by default).
	unreachableGrace time.Duration
	// pendingGrace and workloadGrace default to 120s and 180s.
	// NOTE(review): presumably how long a pending pod / unready workload is
	// tolerated before an issue is raised — confirm against the issue builders.
	pendingGrace time.Duration
	workloadGrace time.Duration
	// crashLoopThresh defaults to 5.
	// NOTE(review): presumably a restart-count threshold — confirm.
	crashLoopThresh int
	// rollupThreshold (default 20) and maxIssues (default 200) are the noise
	// control limits described above.
	rollupThreshold int
	maxIssues int
	// watchFailureThreshold, watchFailureWindow and pollRecoverEvery default to
	// 5, 30s and 30s.
	// NOTE(review): presumably they drive the watch-to-polling degradation and
	// recovery described above — confirm.
	watchFailureThreshold int
	watchFailureWindow time.Duration
	pollRecoverEvery time.Duration
	// NOTE(review): which fields mu guards is not visible in this part of the
	// file — confirm before relying on it.
	mu sync.Mutex
	syncWG sync.WaitGroup
	// client is the Kubernetes API client handed to Ping by Collect.
	client kubernetes.Interface
	// Informer plumbing: shared factory, its stop channel, whether it has been
	// started, and the cache-sync checks.
	factory informers.SharedInformerFactory
	stopCh chan struct{}
	started bool
	syncedFns []cache.InformerSynced
	// Listers backed by the informer caches, one per watched resource kind.
	podsLister corelisters.PodLister
	nodesLister corelisters.NodeLister
	eventsLister corelisters.EventLister
	deployLister appslisters.DeploymentLister
	statefulSetLister appslisters.StatefulSetLister
	daemonSetLister appslisters.DaemonSetLister
	// polling indicates we have degraded from informers to list polling.
	polling bool
	pollSince time.Time
	lastPollRecoverAttempt time.Time
	watchFailWindowStart time.Time
	watchFailCount int
	// rbacDenied is keyed by resource name ("pods", "nodes", ...).
	rbacDenied map[string]error
	// unreach tracks connectivity failures and successes; Collect consults it
	// before emitting the "unreachable" issue.
	unreach *unreachableTracker
	// lastSuccess is the time of the most recent successful Ping; it is
	// reported as Status.LastSuccess.
	lastSuccess time.Time
}
// NewCollector returns a Collector configured with the default intervals,
// grace periods and thresholds, an empty RBAC-denied map, and an unreachable
// tracker created with the same grace period that is stored in
// unreachableGrace.
func NewCollector() *Collector {
	const unreachableGrace = 10 * time.Second

	return &Collector{
		interval:              2 * time.Second,
		unreachableGrace:      unreachableGrace,
		pendingGrace:          120 * time.Second,
		workloadGrace:         180 * time.Second,
		crashLoopThresh:       5,
		rollupThreshold:       20,
		maxIssues:             200,
		watchFailureThreshold: 5,
		watchFailureWindow:    30 * time.Second,
		pollRecoverEvery:      30 * time.Second,
		rbacDenied:            make(map[string]error),
		unreach:               newUnreachableTracker(unreachableGrace),
	}
}
// Compile-time check that *Collector satisfies collectors.Collector.
var _ collectors.Collector = (*Collector)(nil)
// Name returns the fixed identifier of this collector, "k8s".
func (c *Collector) Name() string {
	return "k8s"
}
// Interval returns the configured collection interval, falling back to two
// seconds when the configured value is zero or negative.
func (c *Collector) Interval() time.Duration {
	if c.interval > 0 {
		return c.interval
	}
	return 2 * time.Second
}
// Collect runs one collection tick against the current kubeconfig context.
//
// Flow:
//  1. Bail out if ctx is already done (the only path that returns an error).
//  2. Treat a missing kubeconfig as "Kubernetes disabled" (degraded, no issues).
//  3. Ensure a client exists and ping the API server; failures go through the
//     unreachable tracker so the unreachable issue is only emitted once the
//     tracker says so.
//  4. Gather issues from informer caches, or by LIST polling when degraded or
//     when the caches have not synced yet.
//  5. Roll up, stamp, sort and cap the result.
//
// Every returned Status carries the last successful collect time so the UI can
// keep showing it while the collector is failing.
func (c *Collector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	now := time.Now()
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled", LastSuccess: c.lastSuccess}, err
	}
	// If kubeconfig doesn't exist, treat Kubernetes as "disabled".
	if !kubeconfigExists() {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "kubeconfig not found", LastSuccess: c.lastSuccess}, nil
	}

	// unreachable records a connectivity failure and builds the return values:
	// no issues while the tracker is still within its grace period, the
	// unreachable issue afterwards.
	unreachable := func(err error, graceMsg string) ([]model.Issue, collectors.Status, error) {
		st := collectors.Status{Health: collectors.HealthError, Message: graceMsg, LastSuccess: c.lastSuccess}
		c.unreach.observeFailure(now, err)
		if !c.unreach.shouldEmit(now) {
			return nil, st, nil
		}
		st.Message = "unreachable"
		return []model.Issue{stampIssueTimes(now, unreachableIssue(err))}, st, nil
	}
	if err := c.ensureClient(); err != nil {
		return unreachable(err, "k8s client init failed (grace)")
	}
	// Connectivity/auth check with grace.
	if err := Ping(ctx, c.client); err != nil {
		return unreachable(err, "k8s unreachable (grace)")
	}
	c.unreach.observeSuccess()
	c.lastSuccess = now

	// Prefer informers unless currently degraded to polling.
	if c.isPolling() {
		c.maybeRecoverInformers(ctx, now)
	}
	if !c.isPolling() {
		// Best effort: ensureInformers only fails when the client is nil,
		// which ensureClient has just ruled out.
		_ = c.ensureInformers(ctx)
	}

	issues := make([]model.Issue, 0, 64)
	issues = append(issues, c.rbacIssues()...)
	st := collectors.Status{Health: collectors.HealthOK, LastSuccess: c.lastSuccess}
	switch {
	case c.isPolling():
		st.Health = collectors.HealthDegraded
		st.Message = "degraded to polling"
		issues = append(issues, pollingDegradedIssue())
		issues = append(issues, c.collectByPolling(ctx, now)...)
	case !c.cachesSyncedQuick(ctx):
		// If caches aren't ready, use polling for this tick only.
		st.Health = collectors.HealthDegraded
		st.Message = "waiting for informer cache; used list"
		issues = append(issues, c.collectByPolling(ctx, now)...)
	default:
		issues = append(issues, c.collectFromCaches(now)...)
		if len(c.snapshotRBACDenied()) > 0 {
			st.Health = collectors.HealthDegraded
			st.Message = "partial RBAC access"
		}
	}

	// Roll up first, then stamp: rollup issues are created inside Rollup and
	// would otherwise reach sorting and the UI with zero timestamps.
	issues = Rollup(issues, c.rollupThreshold, 5)
	for i := range issues {
		issues[i] = stampIssueTimes(now, issues[i])
	}
	model.SortIssuesDefault(issues)
	issues = CapIssues(issues, c.maxIssues)
	return issues, st, nil
}
// ensureClient lazily creates the Kubernetes clientset from the current
// kubeconfig context and caches it on the collector. It is a no-op once a
// client exists; a construction error is returned and nothing is cached.
func (c *Collector) ensureClient() error {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.client == nil {
		clientset, _, err := ClientFromCurrentContext()
		if err != nil {
			return err
		}
		c.client = clientset
	}
	return nil
}
// kubeconfigExists reports whether a kubeconfig file can be stat'ed.
//
// When KUBECONFIG is set, only the paths listed there are considered and the
// default path is not consulted. Otherwise the default kubeconfig path is
// checked, and an empty default path counts as "not found".
func kubeconfigExists() bool {
	statOK := func(path string) bool {
		_, err := os.Stat(path)
		return err == nil
	}
	if env := os.Getenv("KUBECONFIG"); env != "" {
		for _, candidate := range filepath.SplitList(env) {
			if candidate != "" && statOK(candidate) {
				return true
			}
		}
		return false
	}
	def := defaultKubeconfigPath()
	return def != "" && statOK(def)
}
// ensureInformers builds and starts the shared informers for every resource
// that is not RBAC-denied. It is a no-op when informers are already running or
// the collector is in polling mode, and returns an error only when no client
// has been created yet.
//
// The new informer set (factory, stop channel, listers, sync functions) is
// published in a single critical section, so readers never observe a
// half-initialised set. A background goroutine waits up to 10s for the caches
// to sync and exits early when the informers are stopped. A sync timeout is not
// printed: Collect already reports it through its Status
// ("waiting for informer cache; used list"), and writing to stdout would
// corrupt the TUI.
func (c *Collector) ensureInformers(ctx context.Context) error {
	c.mu.Lock()
	if c.started || c.polling {
		c.mu.Unlock()
		return nil
	}
	client := c.client
	c.mu.Unlock()
	if client == nil {
		return fmt.Errorf("nil kubernetes client")
	}

	// RBAC preflight before we even construct informers (so we can skip forbidden ones).
	c.preflightRBAC(ctx, client)

	factory := informers.NewSharedInformerFactory(client, 0)
	synced := make([]cache.InformerSynced, 0, 6)
	// track wires the watch error handler for one informer and remembers its
	// HasSynced function.
	track := func(resource string, inf cache.SharedIndexInformer) {
		// SetWatchErrorHandler only fails on an already-started informer;
		// this factory has not been started yet.
		_ = inf.SetWatchErrorHandler(func(_ *cache.Reflector, err error) {
			c.recordWatchError(resource, err)
		})
		synced = append(synced, inf.HasSynced)
	}

	var (
		podsLister   corelisters.PodLister
		nodesLister  corelisters.NodeLister
		eventsLister corelisters.EventLister
		deployLister appslisters.DeploymentLister
		stsLister    appslisters.StatefulSetLister
		dsLister     appslisters.DaemonSetLister
	)
	if !c.isRBACDenied("pods") {
		i := factory.Core().V1().Pods()
		track("pods", i.Informer())
		podsLister = i.Lister()
	}
	if !c.isRBACDenied("nodes") {
		i := factory.Core().V1().Nodes()
		track("nodes", i.Informer())
		nodesLister = i.Lister()
	}
	if !c.isRBACDenied("events") {
		i := factory.Core().V1().Events()
		track("events", i.Informer())
		eventsLister = i.Lister()
	}
	if !c.isRBACDenied("deployments") {
		i := factory.Apps().V1().Deployments()
		track("deployments", i.Informer())
		deployLister = i.Lister()
	}
	if !c.isRBACDenied("statefulsets") {
		i := factory.Apps().V1().StatefulSets()
		track("statefulsets", i.Informer())
		stsLister = i.Lister()
	}
	if !c.isRBACDenied("daemonsets") {
		i := factory.Apps().V1().DaemonSets()
		track("daemonsets", i.Informer())
		dsLister = i.Lister()
	}

	stopCh := make(chan struct{})
	c.mu.Lock()
	if c.started || c.polling {
		// Another caller won the race while we were probing; the factory
		// built here was never started, so there is nothing to clean up.
		c.mu.Unlock()
		return nil
	}
	c.factory = factory
	c.stopCh = stopCh
	c.started = true
	c.syncedFns = synced
	c.podsLister = podsLister
	c.nodesLister = nodesLister
	c.eventsLister = eventsLister
	c.deployLister = deployLister
	c.statefulSetLister = stsLister
	c.daemonSetLister = dsLister
	// Register the sync goroutine before any watch can fail and wait on it.
	c.syncWG.Add(1)
	c.mu.Unlock()

	factory.Start(stopCh)
	go func() {
		defer c.syncWG.Done()
		syncCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()
		// Give up as soon as the informers are stopped; their caches will
		// never sync after that. This helper exits when syncCtx ends.
		go func() {
			select {
			case <-stopCh:
				cancel()
			case <-syncCtx.Done():
			}
		}()
		cache.WaitForCacheSync(syncCtx.Done(), synced...)
	}()
	return nil
}
// maybeRecoverInformers leaves polling mode and restarts the informers, at
// most once per pollRecoverEvery (30s when unset).
//
// The interval is measured from the later of the last recovery attempt and the
// moment polling mode was entered. Without the pollSince reference the first
// attempt would fire on the very next tick after degrading, because
// lastPollRecoverAttempt is still zero at that point.
//
// The check and the state reset happen in one critical section so a concurrent
// recordWatchError cannot interleave with a half-applied reset.
func (c *Collector) maybeRecoverInformers(ctx context.Context, now time.Time) {
	c.mu.Lock()
	interval := c.pollRecoverEvery
	if interval <= 0 {
		interval = 30 * time.Second
	}
	ref := c.lastPollRecoverAttempt
	if c.pollSince.After(ref) {
		ref = c.pollSince
	}
	if !ref.IsZero() && now.Sub(ref) < interval {
		c.mu.Unlock()
		return
	}
	c.lastPollRecoverAttempt = now
	// Only attempt if connectivity is OK (already pinged successfully in Collect).
	// Reset watch failure counters and exit polling so ensureInformers will run.
	c.polling = false
	c.pollSince = time.Time{}
	c.watchFailWindowStart = time.Time{}
	c.watchFailCount = 0
	c.mu.Unlock()

	// Best effort: a failure here leaves informers stopped and Collect falls
	// back to LIST for the tick.
	_ = c.ensureInformers(ctx)
}
// preflightRBAC issues a LIST with Limit 1 for each resource the collector
// watches and records the resources for which the API server answers
// Forbidden. All probes share one two-second timeout derived from ctx; errors
// other than Forbidden are ignored here.
func (c *Collector) preflightRBAC(ctx context.Context, client kubernetes.Interface) {
	probeCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
	defer cancel()

	one := metav1.ListOptions{Limit: 1}
	probes := []struct {
		resource string
		list     func() error
	}{
		{"nodes", func() error {
			_, err := client.CoreV1().Nodes().List(probeCtx, one)
			return err
		}},
		{"pods", func() error {
			_, err := client.CoreV1().Pods(metav1.NamespaceAll).List(probeCtx, one)
			return err
		}},
		{"deployments", func() error {
			_, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(probeCtx, one)
			return err
		}},
		{"statefulsets", func() error {
			_, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(probeCtx, one)
			return err
		}},
		{"daemonsets", func() error {
			_, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(probeCtx, one)
			return err
		}},
		{"events", func() error {
			_, err := client.CoreV1().Events(metav1.NamespaceAll).List(probeCtx, one)
			return err
		}},
	}
	for _, p := range probes {
		if err := p.list(); err != nil && apierrors.IsForbidden(err) {
			c.noteRBAC(p.resource, err)
		}
	}
}
// noteRBAC remembers the first Forbidden error seen for resource. Nil and
// non-Forbidden errors are ignored, and an existing entry is never overwritten.
func (c *Collector) noteRBAC(resource string, err error) {
	if err == nil {
		return
	}
	if !apierrors.IsForbidden(err) {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	if _, seen := c.rbacDenied[resource]; !seen {
		c.rbacDenied[resource] = err
	}
}
// isRBACDenied reports whether a Forbidden error has been recorded for resource.
func (c *Collector) isRBACDenied(resource string) bool {
	c.mu.Lock()
	_, denied := c.rbacDenied[resource]
	c.mu.Unlock()
	return denied
}
// snapshotRBACDenied returns a copy of the RBAC-denied map, taken under the
// collector mutex, that the caller may read without further locking.
func (c *Collector) snapshotRBACDenied() map[string]error {
	c.mu.Lock()
	defer c.mu.Unlock()
	snapshot := make(map[string]error, len(c.rbacDenied))
	for resource, cause := range c.rbacDenied {
		snapshot[resource] = cause
	}
	return snapshot
}
// recordWatchError is the informer watch error handler.
//
// Forbidden errors are recorded as RBAC denials and do not count as watch
// failures. Other errors are counted in a window of watchFailureWindow; once
// watchFailureThreshold failures are seen within one window, the collector
// stops its informers and switches to polling mode.
//
// The wait for the cache-sync goroutine happens after the mutex is released.
// That goroutine can run for several seconds, and holding c.mu across the wait
// would block every Collect-side accessor (isPolling, snapshotRBACDenied, ...)
// for that long.
func (c *Collector) recordWatchError(resource string, err error) {
	if err == nil {
		return
	}
	if apierrors.IsForbidden(err) {
		c.noteRBAC(resource, err)
		return
	}
	now := time.Now()

	c.mu.Lock()
	if c.polling {
		c.mu.Unlock()
		return
	}
	// Start a fresh counting window when none is open or the current one expired.
	if c.watchFailWindowStart.IsZero() || now.Sub(c.watchFailWindowStart) > c.watchFailureWindow {
		c.watchFailWindowStart = now
		c.watchFailCount = 0
	}
	c.watchFailCount++
	degrade := c.watchFailCount >= c.watchFailureThreshold
	if degrade {
		c.polling = true
		c.pollSince = now
		if c.stopCh != nil {
			close(c.stopCh)
			c.stopCh = nil
		}
		c.started = false
		c.factory = nil
		c.syncedFns = nil
	}
	c.mu.Unlock()

	if degrade {
		c.syncWG.Wait()
	}
}
// cachesSyncedQuick reports whether all registered informer caches have
// synced, waiting at most 200ms (or until ctx is done). It returns false when
// no informers are registered.
func (c *Collector) cachesSyncedQuick(ctx context.Context) bool {
	c.mu.Lock()
	fns := make([]cache.InformerSynced, len(c.syncedFns))
	copy(fns, c.syncedFns)
	c.mu.Unlock()

	if len(fns) == 0 {
		return false
	}
	waitCtx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
	defer cancel()
	return cache.WaitForCacheSync(waitCtx.Done(), fns...)
}
// collectFromCaches evaluates the rule functions against the informer caches.
//
// Listers and the RBAC-denied set are snapshotted under the mutex. A resource
// is skipped when it is RBAC-denied or has no lister, and a lister error also
// skips that resource for this tick. Lister results are already slices of
// pointers, so they are handed to the rule functions directly rather than
// being copied element by element first.
func (c *Collector) collectFromCaches(now time.Time) []model.Issue {
	c.mu.Lock()
	podsLister := c.podsLister
	nodesLister := c.nodesLister
	eventsLister := c.eventsLister
	deployLister := c.deployLister
	stsLister := c.statefulSetLister
	dsLister := c.daemonSetLister
	denied := make(map[string]error, len(c.rbacDenied))
	for k, v := range c.rbacDenied {
		denied[k] = v
	}
	c.mu.Unlock()

	allowed := func(resource string) bool {
		_, blocked := denied[resource]
		return !blocked
	}

	issues := make([]model.Issue, 0, 64)
	sel := labels.Everything()
	if allowed("nodes") && nodesLister != nil {
		if nodes, err := nodesLister.List(sel); err == nil {
			issues = append(issues, IssuesFromNodes(nodes)...)
		}
	}
	if allowed("pods") && podsLister != nil {
		if pods, err := podsLister.List(sel); err == nil {
			issues = append(issues, IssuesFromPods(pods, now, c.pendingGrace, c.crashLoopThresh)...)
		}
	}
	if allowed("deployments") && deployLister != nil {
		if deps, err := deployLister.List(sel); err == nil {
			issues = append(issues, IssuesFromDeployments(deps, now, c.workloadGrace)...)
		}
	}
	if allowed("statefulsets") && stsLister != nil {
		if sts, err := stsLister.List(sel); err == nil {
			issues = append(issues, IssuesFromStatefulSets(sts, now, c.workloadGrace)...)
		}
	}
	if allowed("daemonsets") && dsLister != nil {
		if dss, err := dsLister.List(sel); err == nil {
			issues = append(issues, IssuesFromDaemonSets(dss, now, c.workloadGrace)...)
		}
	}
	if allowed("events") && eventsLister != nil {
		if evs, err := eventsLister.List(sel); err == nil {
			issues = append(issues, IssuesFromEvents(evs, now)...)
		}
	}
	return issues
}
// collectByPolling evaluates the rule functions against fresh LIST results
// from the API server. RBAC-denied resources are skipped. A LIST error is
// passed to noteRBAC (which records it only when it is Forbidden) and that
// resource contributes no issues for this tick. Returns nil when no client
// exists.
func (c *Collector) collectByPolling(ctx context.Context, now time.Time) []model.Issue {
	c.mu.Lock()
	client := c.client
	denied := make(map[string]error, len(c.rbacDenied))
	for resource, cause := range c.rbacDenied {
		denied[resource] = cause
	}
	c.mu.Unlock()
	if client == nil {
		return nil
	}

	everything := metav1.ListOptions{}
	found := make([]model.Issue, 0, 64)

	if _, skip := denied["nodes"]; !skip {
		resp, err := client.CoreV1().Nodes().List(ctx, everything)
		if err == nil {
			ptrs := make([]*corev1.Node, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromNodes(ptrs)...)
		} else {
			c.noteRBAC("nodes", err)
		}
	}

	if _, skip := denied["pods"]; !skip {
		resp, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, everything)
		if err == nil {
			ptrs := make([]*corev1.Pod, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromPods(ptrs, now, c.pendingGrace, c.crashLoopThresh)...)
		} else {
			c.noteRBAC("pods", err)
		}
	}

	if _, skip := denied["deployments"]; !skip {
		resp, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(ctx, everything)
		if err == nil {
			ptrs := make([]*appsv1.Deployment, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromDeployments(ptrs, now, c.workloadGrace)...)
		} else {
			c.noteRBAC("deployments", err)
		}
	}

	if _, skip := denied["statefulsets"]; !skip {
		resp, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(ctx, everything)
		if err == nil {
			ptrs := make([]*appsv1.StatefulSet, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromStatefulSets(ptrs, now, c.workloadGrace)...)
		} else {
			c.noteRBAC("statefulsets", err)
		}
	}

	if _, skip := denied["daemonsets"]; !skip {
		resp, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(ctx, everything)
		if err == nil {
			ptrs := make([]*appsv1.DaemonSet, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromDaemonSets(ptrs, now, c.workloadGrace)...)
		} else {
			c.noteRBAC("daemonsets", err)
		}
	}

	if _, skip := denied["events"]; !skip {
		resp, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, everything)
		if err == nil {
			ptrs := make([]*corev1.Event, 0, len(resp.Items))
			for idx := range resp.Items {
				ptrs = append(ptrs, &resp.Items[idx])
			}
			found = append(found, IssuesFromEvents(ptrs, now)...)
		} else {
			c.noteRBAC("events", err)
		}
	}

	return found
}
// rbacIssues returns one P2 issue per RBAC-denied resource, ordered by
// resource name so the output is deterministic.
func (c *Collector) rbacIssues() []model.Issue {
	denied := c.snapshotRBACDenied()
	resources := make([]string, 0, len(denied))
	for resource := range denied {
		resources = append(resources, resource)
	}
	sort.Strings(resources)

	issues := make([]model.Issue, len(resources))
	for idx, resource := range resources {
		issues[idx] = model.Issue{
			ID:       fmt.Sprintf("k8s:rbac:%s", resource),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP2,
			Title:    fmt.Sprintf("Insufficient RBAC: list/watch %s", resource),
			Details:  fmt.Sprintf("Current context cannot access %s (forbidden). %s", resource, sanitizeError(denied[resource])),
			Evidence: map[string]string{
				"kind":      "Cluster",
				"reason":    "RBAC",
				"namespace": "",
				"resource":  resource,
			},
			SuggestedFix: fmt.Sprintf("kubectl auth can-i list %s --all-namespaces", resource),
		}
	}
	return issues
}
// pollingDegradedIssue builds the cluster-level P1 issue reported while the
// collector is using LIST polling instead of watches.
func pollingDegradedIssue() model.Issue {
	evidence := map[string]string{
		"kind":      "Cluster",
		"reason":    "DegradedPolling",
		"namespace": "",
	}
	issue := model.Issue{
		ID:           "k8s:cluster:polling",
		Category:     model.CategoryKubernetes,
		Priority:     model.PriorityP1,
		Title:        "Kubernetes degraded: polling (watch failing)",
		Details:      "Kubernetes watches have failed repeatedly; collector switched to LIST polling. Data may be less real-time and API load is higher.",
		Evidence:     evidence,
		SuggestedFix: "Check API server / network stability and RBAC; ensure watch endpoints are reachable.",
	}
	return issue
}
// stampIssueTimes returns iss with LastSeen set to now and FirstSeen set to
// now when it was previously unset.
func stampIssueTimes(now time.Time, iss model.Issue) model.Issue {
	if iss.FirstSeen.IsZero() {
		iss.FirstSeen = now
	}
	iss.LastSeen = now
	return iss
}
// isPolling reports, under the collector mutex, whether the collector is
// currently in polling mode.
func (c *Collector) isPolling() bool {
	c.mu.Lock()
	polling := c.polling
	c.mu.Unlock()
	return polling
}
+101
View File
@@ -0,0 +1,101 @@
package k8s
import (
"fmt"
"strings"
"time"
corev1 "k8s.io/api/core/v1"
"tower/internal/model"
)
// warningEventReasons is the allow-list of event reasons that IssuesFromEvents
// turns into issues; events with any other reason are ignored. Lookups are
// exact (case-sensitive) matches on Event.Reason.
var warningEventReasons = map[string]struct{}{
	"FailedScheduling": {},
	"FailedMount":      {},
	"BackOff":          {},
	"Unhealthy":        {},
	"OOMKilling":       {},
	"FailedPull":       {},
	"Forbidden":        {},
	"ErrImagePull":     {},
	"ImagePullBackOff": {},
}
// IssuesFromEvents applies the PLAN.md Event rules.
//
// Only Warning events whose reason is in warningEventReasons produce issues,
// one P2 issue per (involved object, reason). The object is identified by
// involvedObject.uid. When the event carries no UID, the identity falls back
// to "namespace/Kind/name" so that events for different objects are not
// collapsed into a single issue with a shared ID.
//
// now is accepted for signature symmetry with the other rule functions and is
// currently unused.
func IssuesFromEvents(events []*corev1.Event, now time.Time) []model.Issue {
	_ = now
	out := make([]model.Issue, 0, 16)
	seen := make(map[string]struct{}, len(events))
	for _, e := range events {
		if e == nil {
			continue
		}
		if !strings.EqualFold(e.Type, string(corev1.EventTypeWarning)) {
			continue
		}
		if _, ok := warningEventReasons[e.Reason]; !ok {
			continue
		}

		ns := e.InvolvedObject.Namespace
		if ns == "" {
			ns = e.Namespace
		}
		objKey := e.InvolvedObject.Kind + "/" + e.InvolvedObject.Name

		uid := string(e.InvolvedObject.UID)
		ident := uid
		if ident == "" {
			// No UID on the event: identify the object by its coordinates.
			ident = ns + "/" + objKey
		}
		k := ident + ":" + e.Reason
		if _, ok := seen[k]; ok {
			continue
		}
		seen[k] = struct{}{}

		title := fmt.Sprintf("K8s Event %s: %s (%s)", e.Reason, objKey, ns)
		if ns == "" {
			title = fmt.Sprintf("K8s Event %s: %s", e.Reason, objKey)
		}
		details := strings.TrimSpace(e.Message)
		if details == "" {
			details = "Warning event emitted by Kubernetes."
		}
		out = append(out, model.Issue{
			ID:       fmt.Sprintf("k8s:event:%s:%s", ident, e.Reason),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP2,
			Title:    title,
			Details:  details,
			Evidence: map[string]string{
				"kind":      e.InvolvedObject.Kind,
				"reason":    e.Reason,
				"namespace": ns,
				"name":      e.InvolvedObject.Name,
				"uid":       uid,
			},
			SuggestedFix: suggestedFixForEvent(ns, e.InvolvedObject.Kind, e.InvolvedObject.Name),
		})
	}
	return out
}
// suggestedFixForEvent returns the "kubectl describe" command for the object
// an event refers to. The kind is lower-cased. The namespace flag is omitted
// when ns is empty or the object is a node.
func suggestedFixForEvent(ns, kind, name string) string {
	resource := strings.ToLower(kind)
	if ns == "" || resource == "node" {
		return fmt.Sprintf("kubectl describe %s %s", resource, name)
	}
	return fmt.Sprintf("kubectl -n %s describe %s %s", ns, resource, name)
}
@@ -0,0 +1,5 @@
//go:build ignore
package k8s
// Placeholder (see rollup_test.go).
+79
View File
@@ -0,0 +1,79 @@
package k8s
import (
"fmt"
corev1 "k8s.io/api/core/v1"
"tower/internal/model"
)
// IssuesFromNodes applies the PLAN.md node rules.
//
// For every non-nil node it emits:
//   - a P0 "NotReady" issue when the Ready condition is present and not True;
//   - a P1 issue for each of MemoryPressure, DiskPressure and PIDPressure whose
//     condition is present and True.
//
// Pure rule function: does not talk to the API server.
func IssuesFromNodes(nodes []*corev1.Node) []model.Issue {
	pressureTypes := []corev1.NodeConditionType{
		corev1.NodeMemoryPressure,
		corev1.NodeDiskPressure,
		corev1.NodePIDPressure,
	}

	issues := make([]model.Issue, 0, 8)
	for _, node := range nodes {
		if node == nil {
			continue
		}
		describe := "kubectl describe node " + node.Name

		// Ready / NotReady
		if ready := findNodeCondition(node, corev1.NodeReady); ready != nil && ready.Status != corev1.ConditionTrue {
			issues = append(issues, model.Issue{
				ID:       fmt.Sprintf("k8s:node:%s:NotReady", node.Name),
				Category: model.CategoryKubernetes,
				Priority: model.PriorityP0,
				Title:    fmt.Sprintf("Node NotReady: %s", node.Name),
				Details:  ready.Message,
				Evidence: map[string]string{
					"kind":      "Node",
					"reason":    "NotReady",
					"namespace": "",
					"node":      node.Name,
					"status":    string(ready.Status),
				},
				SuggestedFix: describe,
			})
		}

		// Pressure conditions.
		for _, pressure := range pressureTypes {
			cond := findNodeCondition(node, pressure)
			if cond == nil || cond.Status != corev1.ConditionTrue {
				continue
			}
			issues = append(issues, model.Issue{
				ID:       fmt.Sprintf("k8s:node:%s:%s", node.Name, string(pressure)),
				Category: model.CategoryKubernetes,
				Priority: model.PriorityP1,
				Title:    fmt.Sprintf("Node %s: %s", pressure, node.Name),
				Details:  cond.Message,
				Evidence: map[string]string{
					"kind":      "Node",
					"reason":    string(pressure),
					"namespace": "",
					"node":      node.Name,
					"status":    string(cond.Status),
				},
				SuggestedFix: describe,
			})
		}
	}
	return issues
}
// findNodeCondition returns a pointer to the first condition of type t in the
// node's status, or nil when n is nil or has no such condition. The pointer
// refers to the element inside n.Status.Conditions, not a copy.
func findNodeCondition(n *corev1.Node, t corev1.NodeConditionType) *corev1.NodeCondition {
	if n == nil {
		return nil
	}
	conds := n.Status.Conditions
	for idx := 0; idx < len(conds); idx++ {
		if conds[idx].Type == t {
			return &conds[idx]
		}
	}
	return nil
}
@@ -0,0 +1,5 @@
//go:build ignore
package k8s
// Placeholder (see rollup_test.go).
+169
View File
@@ -0,0 +1,169 @@
package k8s
import (
"fmt"
"strconv"
"strings"
"time"
corev1 "k8s.io/api/core/v1"
"tower/internal/model"
)
// IssuesFromPods applies the PLAN.md pod rules.
//
// For every non-nil pod it can emit:
//   - P1 "Pending" when the pod has been in phase Pending for at least
//     pendingGrace since its creation timestamp;
//   - per container status: CrashLoopBackOff (P1, or P0 once the restart count
//     reaches crashLoopRestartThreshold), ImagePullBackOff/ErrImagePull (P1),
//     OOMKilled from the last termination state (P1), and "High restarts" (P2)
//     when the restart count reaches the threshold and the container has no
//     waiting reason.
//
// Non-positive pendingGrace and crashLoopRestartThreshold fall back to 120s and
// 5 respectively. Only p.Status.ContainerStatuses is inspected.
//
// Pure rule function: it does not talk to the API server.
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
	// Apply defaults for unset/invalid tunables.
	if crashLoopRestartThreshold <= 0 {
		crashLoopRestartThreshold = 5
	}
	if pendingGrace <= 0 {
		pendingGrace = 120 * time.Second
	}
	out := make([]model.Issue, 0, 32)
	for _, p := range pods {
		if p == nil {
			continue
		}
		ns, name := p.Namespace, p.Name
		// Pending for too long.
		if p.Status.Phase == corev1.PodPending {
			age := now.Sub(p.CreationTimestamp.Time)
			// A zero creation timestamp would yield a huge age; skip those pods.
			if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
				out = append(out, model.Issue{
					ID:       fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
					Category: model.CategoryKubernetes,
					Priority: model.PriorityP1,
					Title:    fmt.Sprintf("Pod Pending: %s/%s", ns, name),
					Details:  fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
					Evidence: map[string]string{
						"kind":      "Pod",
						"reason":    "Pending",
						"namespace": ns,
						"pod":       name,
						"phase":     string(p.Status.Phase),
						"node":      p.Spec.NodeName,
					},
					SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
				})
			}
		}
		// Container-derived signals.
		for _, cs := range p.Status.ContainerStatuses {
			cname := cs.Name
			restarts := int(cs.RestartCount)
			// CrashLoopBackOff and pull errors are reported via Waiting state.
			if cs.State.Waiting != nil {
				reason := cs.State.Waiting.Reason
				msg := cs.State.Waiting.Message
				switch reason {
				case "CrashLoopBackOff":
					// Escalate to P0 once the container has restarted often enough.
					pri := model.PriorityP1
					if restarts >= crashLoopRestartThreshold {
						pri = model.PriorityP0
					}
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: pri,
						Title:    fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "CrashLoopBackOff",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						// Two commands, one per line: describe the pod, then fetch
						// the logs of the previous container instance.
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})
				case "ImagePullBackOff", "ErrImagePull":
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
						Details:  firstNonEmpty(msg, "Container image pull is failing."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    reason,
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}
			// OOMKilled is typically stored in LastTerminationState.
			if cs.LastTerminationState.Terminated != nil {
				term := cs.LastTerminationState.Terminated
				if term.Reason == "OOMKilled" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP1,
						Title:    fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
						Details:  firstNonEmpty(term.Message, "Container was killed due to OOM."),
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "OOMKilled",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
					})
				}
			}
			// High restarts even if running.
			// Keep this lower priority than active CrashLoopBackOff.
			if restarts >= crashLoopRestartThreshold {
				// Skip containers with a waiting reason; the switch above
				// handles the CrashLoopBackOff and image-pull reasons.
				if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
					out = append(out, model.Issue{
						ID:       fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
						Category: model.CategoryKubernetes,
						Priority: model.PriorityP2,
						Title:    fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
						Details:  "Container has restarted multiple times.",
						Evidence: map[string]string{
							"kind":      "Pod",
							"reason":    "HighRestarts",
							"namespace": ns,
							"pod":       name,
							"container": cname,
							"restarts":  strconv.Itoa(restarts),
							"node":      p.Spec.NodeName,
						},
						SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
					})
				}
			}
		}
	}
	return out
}
// firstNonEmpty returns v unchanged unless it is empty or whitespace-only, in
// which case it returns fallback.
func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) == "" {
		return fallback
	}
	return v
}
@@ -0,0 +1,5 @@
//go:build ignore
package k8s
// Placeholder (see rollup_test.go).
+174
View File
@@ -0,0 +1,174 @@
package k8s
import (
"fmt"
"strconv"
"time"
appsv1 "k8s.io/api/apps/v1"
"tower/internal/model"
)
// defaultWorkloadNotReadyGrace is how long a workload must be NotReady before
// we emit an issue, used when the caller passes a non-positive grace.
const defaultWorkloadNotReadyGrace = 180 * time.Second
// IssuesFromDeployments applies the PLAN.md workload rules for Deployments.
//
// A P1 "NotReady" issue is emitted for each Deployment whose ready replicas
// are below the desired count (1 when spec.replicas is unset), once that has
// lasted at least grace. The grace clock starts at the Progressing condition's
// LastUpdateTime, then its LastTransitionTime, then the creation timestamp. A
// non-positive grace falls back to defaultWorkloadNotReadyGrace.
func IssuesFromDeployments(deploys []*appsv1.Deployment, now time.Time, grace time.Duration) []model.Issue {
	if grace <= 0 {
		grace = defaultWorkloadNotReadyGrace
	}
	issues := make([]model.Issue, 0, 16)
	for _, dep := range deploys {
		if dep == nil {
			continue
		}
		want := int32(1)
		if r := dep.Spec.Replicas; r != nil {
			want = *r
		}
		have := dep.Status.ReadyReplicas
		if want <= 0 || have >= want {
			continue
		}

		// Prefer LastUpdateTime / LastTransitionTime when available; fallback to creation time.
		start := dep.CreationTimestamp.Time
		if prog := findDeploymentProgressingCondition(dep); prog != nil {
			switch {
			case !prog.LastUpdateTime.IsZero():
				start = prog.LastUpdateTime.Time
			case !prog.LastTransitionTime.IsZero():
				start = prog.LastTransitionTime.Time
			}
		}
		if !start.IsZero() && now.Sub(start) < grace {
			continue
		}

		ns, name := dep.Namespace, dep.Name
		issues = append(issues, model.Issue{
			ID:       fmt.Sprintf("k8s:deploy:%s/%s:NotReady", ns, name),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP1,
			Title:    fmt.Sprintf("Deployment not ready: %s/%s", ns, name),
			Details:  "Ready replicas below desired.",
			Evidence: map[string]string{
				"kind":          "Deployment",
				"reason":        "NotReady",
				"namespace":     ns,
				"name":          name,
				"desired":       strconv.Itoa(int(want)),
				"ready":         strconv.Itoa(int(have)),
				"observed_gen":  strconv.FormatInt(dep.Status.ObservedGeneration, 10),
				"resource_gen":  strconv.FormatInt(dep.Generation, 10),
				"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
			},
			SuggestedFix: fmt.Sprintf("kubectl -n %s describe deployment %s", ns, name),
		})
	}
	return issues
}
// IssuesFromStatefulSets applies the PLAN.md workload rules for StatefulSets.
//
// A P1 "NotReady" issue is emitted for each StatefulSet whose ready replicas
// are below the desired count (1 when spec.replicas is unset) and whose
// creation timestamp is at least grace old (or unset). A non-positive grace
// falls back to defaultWorkloadNotReadyGrace.
func IssuesFromStatefulSets(sts []*appsv1.StatefulSet, now time.Time, grace time.Duration) []model.Issue {
	if grace <= 0 {
		grace = defaultWorkloadNotReadyGrace
	}
	issues := make([]model.Issue, 0, 16)
	for _, set := range sts {
		if set == nil {
			continue
		}
		want := int32(1)
		if r := set.Spec.Replicas; r != nil {
			want = *r
		}
		have := set.Status.ReadyReplicas
		if want <= 0 || have >= want {
			continue
		}
		if created := set.CreationTimestamp.Time; !created.IsZero() && now.Sub(created) < grace {
			continue
		}

		ns, name := set.Namespace, set.Name
		issues = append(issues, model.Issue{
			ID:       fmt.Sprintf("k8s:sts:%s/%s:NotReady", ns, name),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP1,
			Title:    fmt.Sprintf("StatefulSet not ready: %s/%s", ns, name),
			Details:  "Ready replicas below desired.",
			Evidence: map[string]string{
				"kind":          "StatefulSet",
				"reason":        "NotReady",
				"namespace":     ns,
				"name":          name,
				"desired":       strconv.Itoa(int(want)),
				"ready":         strconv.Itoa(int(have)),
				"observed_gen":  strconv.FormatInt(set.Status.ObservedGeneration, 10),
				"resource_gen":  strconv.FormatInt(set.Generation, 10),
				"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
			},
			SuggestedFix: fmt.Sprintf("kubectl -n %s describe statefulset %s", ns, name),
		})
	}
	return issues
}
// IssuesFromDaemonSets applies the PLAN.md workload rules for DaemonSets.
//
// A P1 "Unavailable" issue is emitted for each DaemonSet that reports
// unavailable pods and whose creation timestamp is at least grace old (or
// unset). A non-positive grace falls back to defaultWorkloadNotReadyGrace.
func IssuesFromDaemonSets(dss []*appsv1.DaemonSet, now time.Time, grace time.Duration) []model.Issue {
	if grace <= 0 {
		grace = defaultWorkloadNotReadyGrace
	}
	issues := make([]model.Issue, 0, 16)
	for _, set := range dss {
		if set == nil {
			continue
		}
		missing := set.Status.NumberUnavailable
		if missing <= 0 {
			continue
		}
		if created := set.CreationTimestamp.Time; !created.IsZero() && now.Sub(created) < grace {
			continue
		}

		ns, name := set.Namespace, set.Name
		issues = append(issues, model.Issue{
			ID:       fmt.Sprintf("k8s:ds:%s/%s:Unavailable", ns, name),
			Category: model.CategoryKubernetes,
			Priority: model.PriorityP1,
			Title:    fmt.Sprintf("DaemonSet unavailable: %s/%s", ns, name),
			Details:  "DaemonSet has unavailable pods.",
			Evidence: map[string]string{
				"kind":          "DaemonSet",
				"reason":        "Unavailable",
				"namespace":     ns,
				"name":          name,
				"unavailable":   strconv.Itoa(int(missing)),
				"desired":       strconv.Itoa(int(set.Status.DesiredNumberScheduled)),
				"available":     strconv.Itoa(int(set.Status.NumberAvailable)),
				"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
			},
			SuggestedFix: fmt.Sprintf("kubectl -n %s describe daemonset %s", ns, name),
		})
	}
	return issues
}
// findDeploymentProgressingCondition returns a pointer to the first
// Progressing condition in the Deployment's status, or nil when d is nil or
// has no such condition. The pointer refers to the element inside
// d.Status.Conditions, not a copy.
func findDeploymentProgressingCondition(d *appsv1.Deployment) *appsv1.DeploymentCondition {
	if d == nil {
		return nil
	}
	conds := d.Status.Conditions
	for idx := 0; idx < len(conds); idx++ {
		if conds[idx].Type == appsv1.DeploymentProgressing {
			return &conds[idx]
		}
	}
	return nil
}
@@ -0,0 +1,5 @@
//go:build ignore
package k8s
// Placeholder (see rollup_test.go).
+128
View File
@@ -0,0 +1,128 @@
package k8s
import (
"fmt"
"sort"
"strings"
"tower/internal/model"
)
// RollupKey groups similar issues to reduce UI noise.
// Required grouping per prompt: (namespace, reason, kind).
//
// Rollup fills each field from the issue's Evidence map after trimming
// surrounding whitespace.
type RollupKey struct {
	Namespace string // Evidence["namespace"]; may be empty
	Reason    string // Evidence["reason"]
	Kind      string // Evidence["kind"]
}
// Rollup groups issues by (namespace, reason, kind). For any group with size >=
// threshold, it emits a single rollup issue and removes the individual issues
// from the output.
//
// Issues whose evidence lacks a kind or reason are never grouped and are
// returned first, in input order. Groups follow, ordered by namespace, kind and
// reason so the output is deterministic.
//
// A rollup issue takes the highest-weight priority in its group and inherits
// the group's time span: FirstSeen is the earliest non-zero FirstSeen and
// LastSeen is the latest LastSeen of its members. It therefore sorts and
// displays consistently with the issues it replaces instead of carrying zero
// timestamps. Up to sampleN member titles (or IDs when a title is empty) are
// recorded under Evidence["samples"].
//
// Non-positive threshold and sampleN default to 20 and 5.
func Rollup(issues []model.Issue, threshold int, sampleN int) []model.Issue {
	if threshold <= 0 {
		threshold = 20
	}
	if sampleN <= 0 {
		sampleN = 5
	}
	groups := make(map[RollupKey][]model.Issue, 32)
	ungrouped := make([]model.Issue, 0, len(issues))
	for _, iss := range issues {
		// Indexing a nil Evidence map is safe and yields "".
		kind := strings.TrimSpace(iss.Evidence["kind"])
		reason := strings.TrimSpace(iss.Evidence["reason"])
		ns := strings.TrimSpace(iss.Evidence["namespace"])
		if kind == "" || reason == "" {
			ungrouped = append(ungrouped, iss)
			continue
		}
		k := RollupKey{Namespace: ns, Reason: reason, Kind: kind}
		groups[k] = append(groups[k], iss)
	}
	rolled := make([]model.Issue, 0, len(issues))
	rolled = append(rolled, ungrouped...)
	// Stable order for determinism.
	keys := make([]RollupKey, 0, len(groups))
	for k := range groups {
		keys = append(keys, k)
	}
	sort.Slice(keys, func(i, j int) bool {
		if keys[i].Namespace != keys[j].Namespace {
			return keys[i].Namespace < keys[j].Namespace
		}
		if keys[i].Kind != keys[j].Kind {
			return keys[i].Kind < keys[j].Kind
		}
		return keys[i].Reason < keys[j].Reason
	})
	for _, k := range keys {
		grp := groups[k]
		if len(grp) < threshold {
			rolled = append(rolled, grp...)
			continue
		}
		// Determine the max priority and the time span covered by the group.
		// grp is non-empty here because len(grp) >= threshold >= 1.
		maxP := model.PriorityP3
		firstSeen, lastSeen := grp[0].FirstSeen, grp[0].LastSeen
		for _, iss := range grp {
			if iss.Priority.Weight() > maxP.Weight() {
				maxP = iss.Priority
			}
			if !iss.FirstSeen.IsZero() && (firstSeen.IsZero() || iss.FirstSeen.Before(firstSeen)) {
				firstSeen = iss.FirstSeen
			}
			if iss.LastSeen.After(lastSeen) {
				lastSeen = iss.LastSeen
			}
		}
		titleNS := ""
		if k.Namespace != "" {
			titleNS = fmt.Sprintf(" (ns=%s)", k.Namespace)
		}
		title := fmt.Sprintf("%d %ss %s%s", len(grp), strings.ToLower(k.Kind), k.Reason, titleNS)
		samples := make([]string, 0, sampleN)
		for i := 0; i < len(grp) && i < sampleN; i++ {
			s := grp[i].Title
			if s == "" {
				s = grp[i].ID
			}
			samples = append(samples, s)
		}
		rolled = append(rolled, model.Issue{
			ID:        fmt.Sprintf("k8s:rollup:%s:%s:%s", k.Namespace, k.Kind, k.Reason),
			Category:  model.CategoryKubernetes,
			Priority:  maxP,
			Title:     title,
			Details:   "Many similar Kubernetes issues were aggregated into this rollup.",
			FirstSeen: firstSeen,
			LastSeen:  lastSeen,
			Evidence: map[string]string{
				"kind":      k.Kind,
				"reason":    k.Reason,
				"namespace": k.Namespace,
				"count":     fmt.Sprintf("%d", len(grp)),
				"samples":   strings.Join(samples, " | "),
			},
			SuggestedFix: "Filter events/pods and inspect samples with kubectl describe.",
		})
	}
	return rolled
}
// CapIssues limits the issue list to at most max entries, keeping the leading
// ones. A non-positive max falls back to a cap of 200.
//
// The helper is deliberately order-agnostic: callers are expected to sort
// (priority desc, recency desc) and apply rollups before calling it.
//
// When the input already fits, the same slice is returned unchanged; otherwise
// a freshly allocated copy of the first max elements is returned.
func CapIssues(issues []model.Issue, max int) []model.Issue {
	limit := max
	if limit <= 0 {
		limit = 200
	}
	if len(issues) > limit {
		capped := make([]model.Issue, limit)
		copy(capped, issues[:limit])
		return capped
	}
	return issues
}
+10
View File
@@ -0,0 +1,10 @@
//go:build ignore
package k8s
// NOTE: This repository task restricts modifications to a fixed set of owned
// files. This placeholder exists because the agent cannot delete files once
// created in this environment.
//
// Real unit tests for rollups should live in a proper *_test.go file without an
// always-false build tag.
+133
View File
@@ -0,0 +1,133 @@
package k8s
import (
"errors"
"fmt"
"regexp"
"strings"
"time"
"tower/internal/model"
)
// unreachableTracker implements the "10s continuous failure" grace requirement
// for Kubernetes connectivity.
//
// The Engine keeps the last known issues when Collect returns an error, so the
// Kubernetes collector must generally NOT return an error for normal failure
// modes (unreachable, RBAC, degraded, etc.). Instead it should return a health
// Status + issues.
//
// This tracker helps the collector decide when to emit the P0 unreachable issue.
// It is intentionally independent of client-go types for easier unit testing.
//
// Usage: call observeFailure / observeSuccess after each connectivity attempt,
// then ask shouldEmit whether the failure streak has outlasted the grace period.
type unreachableTracker struct {
	// grace is how long failures must persist before shouldEmit reports true.
	// newUnreachableTracker substitutes 10s for non-positive values.
	grace time.Duration
	// firstFailureAt is when the current failure streak began; it is the zero
	// time while no streak is in progress (observeSuccess resets it).
	firstFailureAt time.Time
	// lastErr is the most recent non-nil error passed to observeFailure, or nil
	// after observeSuccess.
	lastErr error
}
// newUnreachableTracker builds a tracker with the given grace period. Zero or
// negative values select the default of 10 seconds. The returned tracker starts
// with no failure streak recorded.
func newUnreachableTracker(grace time.Duration) *unreachableTracker {
	tracker := &unreachableTracker{grace: 10 * time.Second}
	if grace > 0 {
		tracker.grace = grace
	}
	return tracker
}
// observeSuccess records a successful connectivity check, ending any failure
// streak: the streak start time and the remembered error are both cleared.
func (t *unreachableTracker) observeSuccess() {
	t.firstFailureAt, t.lastErr = time.Time{}, nil
}
// observeFailure records a failed connectivity check seen at now.
//
// A nil err is ignored entirely. Otherwise err becomes the remembered last
// error, and now becomes the streak start only if no streak is already running,
// so repeated failures do not reset the clock.
func (t *unreachableTracker) observeFailure(now time.Time, err error) {
	if err == nil {
		return
	}
	if t.firstFailureAt.IsZero() {
		t.firstFailureAt = now
	}
	t.lastErr = err
}
// failingFor reports how long the current failure streak has lasted as of now.
// It returns 0 when no streak is in progress, or when now precedes the streak
// start (so the result is never negative).
func (t *unreachableTracker) failingFor(now time.Time) time.Duration {
	start := t.firstFailureAt
	if start.IsZero() || now.Before(start) {
		return 0
	}
	return now.Sub(start)
}
// shouldEmit reports whether the unreachable issue should be raised at now:
// an error must be on record and the failure streak must have lasted at least
// the grace period.
func (t *unreachableTracker) shouldEmit(now time.Time) bool {
	if t.lastErr == nil {
		return false
	}
	return t.failingFor(now) >= t.grace
}
// lastErrorString renders the remembered error as a single display line: the
// message is passed through sanitizeError, newlines become spaces, and
// surrounding whitespace is trimmed. It returns "" when no error is recorded.
func (t *unreachableTracker) lastErrorString() string {
	if t.lastErr == nil {
		return ""
	}
	oneLine := strings.ReplaceAll(sanitizeError(t.lastErr), "\n", " ")
	return strings.TrimSpace(oneLine)
}
// unreachableIssue builds the single P0 issue reported when the Kubernetes API
// cannot be reached or rejects the credentials.
//
// The issue has a fixed ID and Title. When err is non-nil, its sanitized text
// is appended to Details only, which keeps the Title short.
func unreachableIssue(err error) model.Issue {
	issue := model.Issue{
		ID:       "k8s:cluster:unreachable",
		Category: model.CategoryKubernetes,
		Priority: model.PriorityP0,
		Title:    "Kubernetes cluster unreachable / auth failed",
		Details:  "Kubernetes API is unreachable or credentials are invalid.",
		Evidence: map[string]string{
			"kind":   "Cluster",
			"reason": "Unreachable",
		},
		SuggestedFix: strings.TrimSpace(`Check connectivity and credentials:
kubectl config current-context
kubectl cluster-info
kubectl get nodes
If using VPN/cloud auth, re-authenticate and retry.`),
	}
	if err != nil {
		issue.Details = fmt.Sprintf("%s Last error: %s", issue.Details, sanitizeError(err))
	}
	return issue
}
// errRedactions lists the scrubbing rules sanitizeError applies, in order.
// The patterns are compiled once at package initialization instead of on every
// call.
var errRedactions = []struct {
	pattern     *regexp.Regexp
	replacement string
}{
	// The character class follows the RFC 6750 b64token alphabet. Including "."
	// matters: a JWT is three dot-separated segments, and a class without it
	// would redact only the header and leave the payload and signature visible.
	{regexp.MustCompile(`Bearer [a-zA-Z0-9._~+/=-]{20,}`), "Bearer [REDACTED]"},
	{regexp.MustCompile(`password=[^&\s]+`), "password=[REDACTED]"},
	{regexp.MustCompile(`token=[^&\s]+`), "token=[REDACTED]"},
	{regexp.MustCompile(`secret=[^&\s]+`), "secret=[REDACTED]"},
	// Any URL that mentions "k8s" after the scheme, which also covers hosts of
	// the form "<name>.k8s.<domain>".
	{regexp.MustCompile(`https?://[^\s]+k8s[^\s]*`), "[API_SERVER]"},
}

// sanitizeError returns err's message with credential-looking substrings and
// Kubernetes API server URLs replaced by placeholders, so the text is safe to
// show in the UI or include in exports.
//
// Redacted: bearer tokens of 20+ characters, password=/token=/secret= values
// (up to the next "&" or whitespace), and http(s) URLs containing "k8s".
// A nil err yields "".
func sanitizeError(err error) string {
	if err == nil {
		return ""
	}
	msg := err.Error()
	for _, rule := range errRedactions {
		msg = rule.pattern.ReplaceAllString(msg, rule.replacement)
	}
	return msg
}
// flattenErr renders err as a single trimmed line of text, or "" for nil.
//
// If err wraps another error, only the directly wrapped error's message is
// used (one level of unwrapping), which drops the outermost layer of context
// such as a repeated "context deadline exceeded" prefix.
func flattenErr(err error) string {
	if err == nil {
		return ""
	}
	cause := err
	if inner := errors.Unwrap(err); inner != nil {
		cause = inner
	}
	return strings.TrimSpace(strings.ReplaceAll(cause.Error(), "\n", " "))
}
@@ -0,0 +1,5 @@
//go:build ignore
package k8s
// Placeholder (see rollup_test.go).