feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems. Features include:

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collectors, engine, store, model, and export packages.
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
package collectors
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// Health is the coarse health state a collector reports for one tick.
type Health string

// Health values reported in Status.Health.
const (
	// HealthOK means the collection completed normally.
	HealthOK = Health("OK")
	// HealthDegraded means the collection ran but its data is partial or
	// suspect (for example, some inputs could not be read or parsed).
	HealthDegraded = Health("DEGRADED")
	// HealthError means the collection failed or was canceled.
	HealthError = Health("ERROR")
)
|
||||
|
||||
// Status describes collector health for the current tick.
|
||||
//
|
||||
// Collectors should return Status even when returning an error,
|
||||
// so the UI can show useful context.
|
||||
//
|
||||
// LastSuccess should be the collector's most recent successful collect time.
|
||||
// When unknown, it may be the zero value.
|
||||
//
|
||||
// Message should be short and human-friendly.
|
||||
type Status struct {
|
||||
Health Health `json:"health"`
|
||||
Message string `json:"message,omitempty"`
|
||||
LastSuccess time.Time `json:"last_success,omitempty"`
|
||||
}
|
||||
|
||||
func OKStatus() Status {
|
||||
return Status{Health: HealthOK}
|
||||
}
|
||||
|
||||
// Collector returns "currently true" issues for this tick.
|
||||
//
|
||||
// The store is responsible for dedupe, lifecycle, and resolve-after.
|
||||
// Collectors must respect ctx cancellation.
|
||||
type Collector interface {
|
||||
Name() string
|
||||
Interval() time.Duration
|
||||
Collect(ctx context.Context) ([]model.Issue, Status, error)
|
||||
}
|
||||
@@ -0,0 +1,287 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// DiskCollector reports block and inode pressure for mounted filesystems.
//
// Mounts are discovered from /proc/mounts and measured with statfs; pseudo
// filesystems are filtered out.
//
// Thresholds (PLAN.md):
//   - P1 if blocks OR inodes >= 92%
//   - P0 if blocks OR inodes >= 98%
//
// One issue is emitted per mount and carries both block and inode usage.
//
// NOTE: This collector is Linux-specific.
type DiskCollector struct {
	// interval is the configured collection period; Interval substitutes
	// 10s when it is not positive.
	interval time.Duration

	// readFile and statfs are the I/O entry points used by Collect.
	// NewDiskCollector wires them to os.ReadFile and syscall.Statfs.
	readFile func(path string) ([]byte, error)
	statfs   func(string, *syscall.Statfs_t) error
}
|
||||
|
||||
func NewDiskCollector() *DiskCollector {
|
||||
return &DiskCollector{
|
||||
interval: 10 * time.Second,
|
||||
readFile: os.ReadFile,
|
||||
statfs: syscall.Statfs,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the collector identifier "host:disk".
func (c *DiskCollector) Name() string {
	const name = "host:disk"
	return name
}
|
||||
|
||||
func (c *DiskCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 10 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *DiskCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
b, err := c.readFile("/proc/mounts")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/mounts"}, err
|
||||
}
|
||||
|
||||
mounts := parseProcMounts(string(b))
|
||||
if len(mounts) == 0 {
|
||||
// Unusual but treat as degraded rather than hard error.
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "no mounts found"}, nil
|
||||
}
|
||||
|
||||
issues := make([]model.Issue, 0, 8)
|
||||
seenMount := map[string]struct{}{}
|
||||
|
||||
partialErrs := 0
|
||||
for _, m := range mounts {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return issues, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
if shouldSkipMount(m) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seenMount[m.MountPoint]; ok {
|
||||
continue
|
||||
}
|
||||
seenMount[m.MountPoint] = struct{}{}
|
||||
|
||||
var st syscall.Statfs_t
|
||||
if err := c.statfs(m.MountPoint, &st); err != nil {
|
||||
partialErrs++
|
||||
continue
|
||||
}
|
||||
|
||||
blockPct, blockFreeBytes := statfsBlockUsedPct(st)
|
||||
inodePct := statfsInodeUsedPct(st)
|
||||
|
||||
pri, ok := diskPriority(blockPct, inodePct)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
evidence := map[string]string{
|
||||
"mount": m.MountPoint,
|
||||
"fstype": m.FSType,
|
||||
"block_used_pct": fmt.Sprintf("%.1f", blockPct),
|
||||
"block_free_bytes": strconv.FormatUint(blockFreeBytes, 10),
|
||||
}
|
||||
if inodePct >= 0 {
|
||||
evidence["inode_used_pct"] = fmt.Sprintf("%.1f", inodePct)
|
||||
}
|
||||
|
||||
issues = append(issues, model.Issue{
|
||||
ID: fmt.Sprintf("host:disk:%s:usage", m.MountPoint),
|
||||
Category: model.CategoryStorage,
|
||||
Priority: pri,
|
||||
Title: fmt.Sprintf("Disk usage high on %s", m.MountPoint),
|
||||
Details: "Filesystem space and/or inodes are nearly exhausted.",
|
||||
Evidence: evidence,
|
||||
SuggestedFix: fmt.Sprintf(
|
||||
"Inspect usage:\n df -h %s\n df -i %s\nFind large directories:\n sudo du -xh --max-depth=2 %s | sort -h | tail",
|
||||
m.MountPoint, m.MountPoint, m.MountPoint,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
st := collectors.OKStatus()
|
||||
if partialErrs > 0 {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = fmt.Sprintf("partial failures: %d mounts", partialErrs)
|
||||
}
|
||||
return issues, st, nil
|
||||
}
|
||||
|
||||
// procMount is one parsed row of /proc/mounts.
type procMount struct {
	// Device and MountPoint are the first two columns, with octal escapes decoded.
	Device, MountPoint string
	// FSType is the third column; Options is the fourth column when present.
	FSType, Options string
}
|
||||
|
||||
func parseProcMounts(content string) []procMount {
|
||||
s := bufio.NewScanner(strings.NewReader(content))
|
||||
out := make([]procMount, 0, 32)
|
||||
for s.Scan() {
|
||||
line := strings.TrimSpace(s.Text())
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 3 {
|
||||
continue
|
||||
}
|
||||
m := procMount{
|
||||
Device: unescapeProcMountsField(fields[0]),
|
||||
MountPoint: unescapeProcMountsField(fields[1]),
|
||||
FSType: fields[2],
|
||||
}
|
||||
if len(fields) >= 4 {
|
||||
m.Options = fields[3]
|
||||
}
|
||||
out = append(out, m)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// unescapeProcMountsField decodes the octal escapes used in /proc/mounts fields.
//
// /proc/mounts writes special characters as a backslash followed by exactly
// three octal digits; the most common one is a space as \040. Every such
// sequence in the range \000-\377 is decoded in a single left-to-right pass,
// so the output of one decoded escape is never re-interpreted (for example
// `\134040` becomes the literal text `\040`). Backslashes that are not
// followed by three valid octal digits are copied through unchanged.
func unescapeProcMountsField(s string) string {
	// Fast path: most fields contain no escapes at all.
	if strings.IndexByte(s, '\\') < 0 {
		return s
	}

	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		if s[i] == '\\' && i+3 < len(s) {
			d1, d2, d3 := s[i+1], s[i+2], s[i+3]
			// The first digit is limited to 0-3 so the value fits in one byte.
			if d1 >= '0' && d1 <= '3' && d2 >= '0' && d2 <= '7' && d3 >= '0' && d3 <= '7' {
				b.WriteByte((d1-'0')<<6 | (d2-'0')<<3 | (d3 - '0'))
				i += 3
				continue
			}
		}
		b.WriteByte(s[i])
	}
	return b.String()
}
|
||||
|
||||
// pseudoFSTypes is the set of filesystem types that shouldSkipMount ignores.
var pseudoFSTypes = func() map[string]struct{} {
	names := []string{
		// Kernel and virtual filesystems.
		"proc", "procfs", "sysfs", "devtmpfs", "devpts",
		"cgroup", "cgroup2", "cgroupfs",
		"pstore", "securityfs", "selinuxfs", "debugfs", "tracefs", "configfs",
		"hugetlbfs", "mqueue", "rpc_pipefs", "fusectl", "binfmt_misc",
		"autofs", "bpf", "nsfs", "efivarfs", "systemd-1",
		// Memory-backed filesystems.
		"tmpfs", "ramfs",
		// Common container overlay mounts ("overlayfs" is a non-standard,
		// conservative extra).
		"overlay", "overlayfs",
		// Typically read-only images.
		"squashfs",
		// FUSE helpers.
		"fuse.lxcfs", "fuse.gvfsd-fuse",
	}
	set := make(map[string]struct{}, len(names))
	for _, name := range names {
		set[name] = struct{}{}
	}
	return set
}()
|
||||
|
||||
func shouldSkipMount(m procMount) bool {
|
||||
if m.MountPoint == "" {
|
||||
return true
|
||||
}
|
||||
// Filter by fstype.
|
||||
if _, ok := pseudoFSTypes[m.FSType]; ok {
|
||||
return true
|
||||
}
|
||||
// Filter common pseudo mountpoints.
|
||||
if strings.HasPrefix(m.MountPoint, "/proc") || strings.HasPrefix(m.MountPoint, "/sys") {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(m.MountPoint, "/dev") {
|
||||
// /dev itself can be a real mount in some cases, but usually isn't useful for disk pressure.
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// statfsBlockUsedPct returns the percentage of blocks in use and the number
// of bytes available to unprivileged users.
//
// It mirrors df(1) semantics closely:
//
//	total = f_blocks
//	used  = f_blocks - f_bfree
//	avail = f_bavail (space available to unprivileged user)
//	use%  = used / (used + avail)
//
// It returns (0, 0) when the filesystem reports no blocks or when
// used + avail is zero. If a filesystem reports more free blocks than total
// blocks, used is clamped to zero instead of wrapping around, which would
// otherwise show up as roughly 100% usage.
func statfsBlockUsedPct(st syscall.Statfs_t) (usedPct float64, freeBytes uint64) {
	if st.Blocks == 0 {
		return 0, 0
	}

	bsize := uint64(st.Bsize)
	blocks := uint64(st.Blocks)
	bfree := uint64(st.Bfree)
	bavail := uint64(st.Bavail)

	// Guard the unsigned subtraction below against inconsistent statfs data.
	if bfree > blocks {
		bfree = blocks
	}

	usedBlocks := blocks - bfree
	denom := usedBlocks + bavail
	if denom == 0 {
		return 0, 0
	}

	freeBytes = bavail * bsize
	usedPct = (float64(usedBlocks) / float64(denom)) * 100.0
	return usedPct, freeBytes
}
|
||||
|
||||
// statfsInodeUsedPct returns the percentage of inodes in use, computed as
// (f_files - f_ffree) / f_files. It returns -1 when the filesystem reports
// no inode total (f_files == 0).
func statfsInodeUsedPct(st syscall.Statfs_t) float64 {
	total := float64(st.Files)
	if total == 0 {
		return -1
	}
	return (total - float64(st.Ffree)) / total * 100.0
}
|
||||
|
||||
func diskPriority(blockPct, inodePct float64) (model.Priority, bool) {
|
||||
maxPct := blockPct
|
||||
if inodePct > maxPct {
|
||||
maxPct = inodePct
|
||||
}
|
||||
// inodePct may be -1 if not supported; ignore in that case.
|
||||
if inodePct < 0 {
|
||||
maxPct = blockPct
|
||||
}
|
||||
|
||||
switch {
|
||||
case maxPct >= 98.0:
|
||||
return model.PriorityP0, true
|
||||
case maxPct >= 92.0:
|
||||
return model.PriorityP1, true
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
var _ collectors.Collector = (*DiskCollector)(nil)
|
||||
@@ -0,0 +1,80 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseProcMounts_UnescapesAndParses(t *testing.T) {
|
||||
in := "dev1 / ext4 rw 0 0\n" +
|
||||
"dev2 /path\\040with\\040space xfs rw 0 0\n" +
|
||||
"badline\n"
|
||||
|
||||
ms := parseProcMounts(in)
|
||||
if len(ms) != 2 {
|
||||
t.Fatalf("expected 2 mounts, got %d", len(ms))
|
||||
}
|
||||
if ms[0].MountPoint != "/" || ms[0].FSType != "ext4" {
|
||||
t.Fatalf("unexpected first mount: %+v", ms[0])
|
||||
}
|
||||
if ms[1].MountPoint != "/path with space" {
|
||||
t.Fatalf("expected unescaped mountpoint, got %q", ms[1].MountPoint)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldSkipMount_FiltersPseudo(t *testing.T) {
|
||||
cases := []procMount{
|
||||
{MountPoint: "/proc", FSType: "proc"},
|
||||
{MountPoint: "/sys", FSType: "sysfs"},
|
||||
{MountPoint: "/dev", FSType: "tmpfs"},
|
||||
{MountPoint: "/dev/shm", FSType: "tmpfs"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if !shouldSkipMount(c) {
|
||||
t.Fatalf("expected skip for %+v", c)
|
||||
}
|
||||
}
|
||||
if shouldSkipMount(procMount{MountPoint: "/home", FSType: "ext4"}) {
|
||||
t.Fatalf("did not expect skip for /home ext4")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskPriority(t *testing.T) {
|
||||
if p, ok := diskPriority(91.9, -1); ok {
|
||||
t.Fatalf("expected no issue, got %v", p)
|
||||
}
|
||||
if p, ok := diskPriority(92.0, -1); !ok || p != "P1" {
|
||||
t.Fatalf("expected P1 at 92%%, got %v ok=%v", p, ok)
|
||||
}
|
||||
if p, ok := diskPriority(97.9, 98.0); !ok || p != "P0" {
|
||||
t.Fatalf("expected P0 if either crosses 98%%, got %v ok=%v", p, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatfsCalculations(t *testing.T) {
|
||||
st := syscall.Statfs_t{}
|
||||
st.Bsize = 1
|
||||
st.Blocks = 100
|
||||
st.Bfree = 8
|
||||
st.Bavail = 8
|
||||
|
||||
pct, free := statfsBlockUsedPct(st)
|
||||
if free != 8 {
|
||||
t.Fatalf("expected free=8 bytes, got %d", free)
|
||||
}
|
||||
if pct < 91.9 || pct > 92.1 {
|
||||
t.Fatalf("expected ~92%% used, got %f", pct)
|
||||
}
|
||||
|
||||
st.Files = 100
|
||||
st.Ffree = 2
|
||||
ipct := statfsInodeUsedPct(st)
|
||||
if ipct < 97.9 || ipct > 98.1 {
|
||||
t.Fatalf("expected ~98%% inode used, got %f", ipct)
|
||||
}
|
||||
|
||||
st.Files = 0
|
||||
if statfsInodeUsedPct(st) != -1 {
|
||||
t.Fatalf("expected -1 when inode info unavailable")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// LoadCollector evaluates 1-minute load average normalized by logical CPU count.
|
||||
//
|
||||
// Thresholds (PLAN.md), normalized by CPU count:
|
||||
// - P2 if load1/cpus >= 4.0 sustained 120s
|
||||
// - P1 if load1/cpus >= 6.0 sustained 120s
|
||||
//
|
||||
// NOTE: Linux-specific.
|
||||
// Thread-safe: Collect() can be called concurrently.
|
||||
type LoadCollector struct {
|
||||
interval time.Duration
|
||||
|
||||
now func() time.Time
|
||||
readFile func(string) ([]byte, error)
|
||||
cpuCount func() int
|
||||
|
||||
mu sync.Mutex
|
||||
|
||||
pri model.Priority
|
||||
since time.Time
|
||||
}
|
||||
|
||||
func NewLoadCollector() *LoadCollector {
|
||||
return &LoadCollector{
|
||||
interval: 5 * time.Second,
|
||||
now: time.Now,
|
||||
readFile: os.ReadFile,
|
||||
cpuCount: runtime.NumCPU,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the collector identifier "host:load".
func (c *LoadCollector) Name() string {
	const name = "host:load"
	return name
}
|
||||
|
||||
func (c *LoadCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 5 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
now := c.now()
|
||||
b, err := c.readFile("/proc/loadavg")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
|
||||
}
|
||||
|
||||
load1, err := parseProcLoadavgFirst(string(b))
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
|
||||
}
|
||||
|
||||
cpus := c.cpuCount()
|
||||
if cpus <= 0 {
|
||||
cpus = 1
|
||||
}
|
||||
norm := load1 / float64(cpus)
|
||||
desired, window := desiredLoadPriority(norm)
|
||||
c.mu.Lock()
|
||||
c.pri, c.since = updateSustained(now, c.pri, c.since, desired)
|
||||
pri, since := c.pri, c.since
|
||||
c.mu.Unlock()
|
||||
|
||||
if pri == "" || since.IsZero() || now.Sub(since) < window {
|
||||
return nil, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
iss := model.Issue{
|
||||
ID: "host:load:high",
|
||||
Category: model.CategoryPerformance,
|
||||
Priority: pri,
|
||||
Title: "High sustained system load",
|
||||
Details: "The 1-minute load average is high relative to CPU count for a sustained period.",
|
||||
Evidence: map[string]string{
|
||||
"load1": fmt.Sprintf("%.2f", load1),
|
||||
"cpus": strconv.Itoa(cpus),
|
||||
"load1_per_cpu": fmt.Sprintf("%.2f", norm),
|
||||
"sustained_window": window.String(),
|
||||
},
|
||||
SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
|
||||
}
|
||||
return []model.Issue{iss}, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
// parseProcLoadavgFirst returns the first whitespace-separated field of
// /proc/loadavg content (the 1-minute load average) as a float64.
//
// Expected format: "1.23 0.70 0.50 1/123 4567". It returns an error when the
// content has no fields or when the first field is not a valid float; the
// returned value is 0 in both cases.
func parseProcLoadavgFirst(content string) (float64, error) {
	parts := strings.Fields(content)
	if len(parts) == 0 {
		return 0, fmt.Errorf("missing fields")
	}

	load1, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 0, err
	}
	return load1, nil
}
|
||||
|
||||
func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
|
||||
if loadPerCPU >= 6.0 {
|
||||
return model.PriorityP1, 120 * time.Second
|
||||
}
|
||||
if loadPerCPU >= 4.0 {
|
||||
return model.PriorityP2, 120 * time.Second
|
||||
}
|
||||
return "", 0
|
||||
}
|
||||
|
||||
var _ collectors.Collector = (*LoadCollector)(nil)
|
||||
@@ -0,0 +1,48 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
func TestParseProcLoadavgFirst(t *testing.T) {
|
||||
v, err := parseProcLoadavgFirst("1.23 0.70 0.50 1/123 4567\n")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected err: %v", err)
|
||||
}
|
||||
if v < 1.229 || v > 1.231 {
|
||||
t.Fatalf("expected 1.23, got %v", v)
|
||||
}
|
||||
if _, err := parseProcLoadavgFirst("\n"); err == nil {
|
||||
t.Fatalf("expected error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDesiredLoadPriority(t *testing.T) {
|
||||
p, w := desiredLoadPriority(3.99)
|
||||
if p != "" || w != 0 {
|
||||
t.Fatalf("expected none")
|
||||
}
|
||||
p, w = desiredLoadPriority(4.0)
|
||||
if p != model.PriorityP2 || w != 120*time.Second {
|
||||
t.Fatalf("expected P2/120s")
|
||||
}
|
||||
p, w = desiredLoadPriority(6.0)
|
||||
if p != model.PriorityP1 || w != 120*time.Second {
|
||||
t.Fatalf("expected P1/120s")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateSustainedWorksForLoadToo(t *testing.T) {
|
||||
now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
|
||||
p, since := updateSustained(now, "", time.Time{}, model.PriorityP2)
|
||||
if p != model.PriorityP2 || !since.Equal(now) {
|
||||
t.Fatalf("expected set")
|
||||
}
|
||||
p2, since2 := updateSustained(now.Add(10*time.Second), p, since, model.PriorityP2)
|
||||
if p2 != model.PriorityP2 || !since2.Equal(since) {
|
||||
t.Fatalf("expected unchanged")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// MemCollector checks MemAvailable and swap pressure from /proc/meminfo.
|
||||
//
|
||||
// Thresholds (PLAN.md):
|
||||
// Memory (MemAvailable as % of MemTotal):
|
||||
// - P2 if <= 15% sustained 60s
|
||||
// - P1 if <= 10% sustained 60s
|
||||
// - P0 if <= 5% sustained 30s
|
||||
//
|
||||
// Swap pressure (only if RAM is also tight):
|
||||
// - P1 if swap used >= 50% AND MemAvailable <= 10% sustained 60s
|
||||
// - P0 if swap used >= 80% AND MemAvailable <= 5% sustained 30s
|
||||
//
|
||||
// Emits up to two issues:
|
||||
// - host:mem:available
|
||||
// - host:mem:swap
|
||||
//
|
||||
// NOTE: Linux-specific.
|
||||
// Thread-safe: Collect() can be called concurrently.
|
||||
type MemCollector struct {
|
||||
interval time.Duration
|
||||
|
||||
now func() time.Time
|
||||
readFile func(string) ([]byte, error)
|
||||
|
||||
mu sync.Mutex
|
||||
|
||||
memPri model.Priority
|
||||
memSince time.Time
|
||||
|
||||
swapPri model.Priority
|
||||
swapSince time.Time
|
||||
}
|
||||
|
||||
func NewMemCollector() *MemCollector {
|
||||
return &MemCollector{
|
||||
interval: 5 * time.Second,
|
||||
now: time.Now,
|
||||
readFile: os.ReadFile,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the collector identifier "host:mem".
func (c *MemCollector) Name() string {
	const name = "host:mem"
	return name
}
|
||||
|
||||
func (c *MemCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 5 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *MemCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
now := c.now()
|
||||
b, err := c.readFile("/proc/meminfo")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/meminfo"}, err
|
||||
}
|
||||
|
||||
mi := parseProcMeminfo(string(b))
|
||||
memTotalKB, okT := mi["MemTotal"]
|
||||
memAvailKB, okA := mi["MemAvailable"]
|
||||
if !okT || !okA || memTotalKB <= 0 {
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "missing MemTotal/MemAvailable"}, nil
|
||||
}
|
||||
|
||||
memAvailPct := (float64(memAvailKB) / float64(memTotalKB)) * 100.0
|
||||
|
||||
desiredMemPri, memWindow := desiredMemPriority(memAvailPct)
|
||||
c.mu.Lock()
|
||||
c.memPri, c.memSince = updateSustained(now, c.memPri, c.memSince, desiredMemPri)
|
||||
memPri, memSince := c.memPri, c.memSince
|
||||
c.mu.Unlock()
|
||||
|
||||
issues := make([]model.Issue, 0, 2)
|
||||
if memPri != "" && !memSince.IsZero() && now.Sub(memSince) >= memWindow {
|
||||
issues = append(issues, model.Issue{
|
||||
ID: "host:mem:available",
|
||||
Category: model.CategoryMemory,
|
||||
Priority: memPri,
|
||||
Title: "Low available memory",
|
||||
Details: "MemAvailable is low and has remained low for a sustained period.",
|
||||
Evidence: map[string]string{
|
||||
"mem_available_kb": strconv.FormatInt(memAvailKB, 10),
|
||||
"mem_total_kb": strconv.FormatInt(memTotalKB, 10),
|
||||
"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
|
||||
},
|
||||
SuggestedFix: "Identify memory hogs:\n free -h\n ps aux --sort=-rss | head\nConsider restarting runaway processes or adding RAM.",
|
||||
})
|
||||
}
|
||||
|
||||
swapTotalKB, okST := mi["SwapTotal"]
|
||||
swapFreeKB, okSF := mi["SwapFree"]
|
||||
swapUsedPct := 0.0
|
||||
if okST && okSF && swapTotalKB > 0 {
|
||||
swapUsedKB := swapTotalKB - swapFreeKB
|
||||
swapUsedPct = (float64(swapUsedKB) / float64(swapTotalKB)) * 100.0
|
||||
}
|
||||
|
||||
desiredSwapPri, swapWindow := desiredSwapPriority(memAvailPct, swapTotalKB, swapUsedPct)
|
||||
c.mu.Lock()
|
||||
c.swapPri, c.swapSince = updateSustained(now, c.swapPri, c.swapSince, desiredSwapPri)
|
||||
swapPri, swapSince := c.swapPri, c.swapSince
|
||||
c.mu.Unlock()
|
||||
if swapPri != "" && !swapSince.IsZero() && now.Sub(swapSince) >= swapWindow {
|
||||
issues = append(issues, model.Issue{
|
||||
ID: "host:mem:swap",
|
||||
Category: model.CategoryMemory,
|
||||
Priority: swapPri,
|
||||
Title: "High swap usage with low RAM",
|
||||
Details: "Swap usage is high while available RAM is also low, indicating memory pressure.",
|
||||
Evidence: map[string]string{
|
||||
"swap_used_pct": fmt.Sprintf("%.1f", swapUsedPct),
|
||||
"swap_total_kb": strconv.FormatInt(swapTotalKB, 10),
|
||||
"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
|
||||
},
|
||||
SuggestedFix: "Find swapping processes:\n vmstat 1\n smem -r 2>/dev/null || true\nConsider reducing memory usage or increasing RAM/swap.",
|
||||
})
|
||||
}
|
||||
|
||||
return issues, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
// parseProcMeminfo parses /proc/meminfo content into a map from key (with
// the trailing colon removed) to the integer in the second column.
//
// Example row: "MemAvailable: 12345 kB". Rows with fewer than two columns or
// with a non-integer value are ignored; the unit column is not interpreted.
func parseProcMeminfo(content string) map[string]int64 {
	values := make(map[string]int64)
	sc := bufio.NewScanner(strings.NewReader(content))
	for sc.Scan() {
		// Fields ignores leading/trailing whitespace, so blank rows yield no columns.
		cols := strings.Fields(sc.Text())
		if len(cols) < 2 {
			continue
		}
		n, err := strconv.ParseInt(cols[1], 10, 64)
		if err != nil {
			continue
		}
		values[strings.TrimSuffix(cols[0], ":")] = n
	}
	return values
}
|
||||
|
||||
func desiredMemPriority(memAvailPct float64) (model.Priority, time.Duration) {
|
||||
switch {
|
||||
case memAvailPct <= 5.0:
|
||||
return model.PriorityP0, 30 * time.Second
|
||||
case memAvailPct <= 10.0:
|
||||
return model.PriorityP1, 60 * time.Second
|
||||
case memAvailPct <= 15.0:
|
||||
return model.PriorityP2, 60 * time.Second
|
||||
default:
|
||||
return "", 0
|
||||
}
|
||||
}
|
||||
|
||||
func desiredSwapPriority(memAvailPct float64, swapTotalKB int64, swapUsedPct float64) (model.Priority, time.Duration) {
|
||||
if swapTotalKB <= 0 {
|
||||
return "", 0
|
||||
}
|
||||
// Only alert on swap when RAM is also tight.
|
||||
switch {
|
||||
case swapUsedPct >= 80.0 && memAvailPct <= 5.0:
|
||||
return model.PriorityP0, 30 * time.Second
|
||||
case swapUsedPct >= 50.0 && memAvailPct <= 10.0:
|
||||
return model.PriorityP1, 60 * time.Second
|
||||
default:
|
||||
return "", 0
|
||||
}
|
||||
}
|
||||
|
||||
// updateSustained updates current severity and its since timestamp.
|
||||
// If desired is empty, it clears the state.
|
||||
func updateSustained(now time.Time, current model.Priority, since time.Time, desired model.Priority) (model.Priority, time.Time) {
|
||||
if desired == "" {
|
||||
return "", time.Time{}
|
||||
}
|
||||
if current != desired || since.IsZero() {
|
||||
return desired, now
|
||||
}
|
||||
return current, since
|
||||
}
|
||||
|
||||
var _ collectors.Collector = (*MemCollector)(nil)
|
||||
@@ -0,0 +1,83 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
func TestParseProcMeminfo(t *testing.T) {
|
||||
in := "MemTotal: 8000000 kB\nMemAvailable: 800000 kB\nSwapTotal: 2000000 kB\nSwapFree: 500000 kB\n"
|
||||
m := parseProcMeminfo(in)
|
||||
if m["MemTotal"] != 8000000 {
|
||||
t.Fatalf("MemTotal mismatch: %d", m["MemTotal"])
|
||||
}
|
||||
if m["MemAvailable"] != 800000 {
|
||||
t.Fatalf("MemAvailable mismatch: %d", m["MemAvailable"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestDesiredMemPriority(t *testing.T) {
|
||||
p, w := desiredMemPriority(16.0)
|
||||
if p != "" || w != 0 {
|
||||
t.Fatalf("expected none")
|
||||
}
|
||||
|
||||
p, w = desiredMemPriority(15.0)
|
||||
if p != model.PriorityP2 || w != 60*time.Second {
|
||||
t.Fatalf("expected P2/60s got %v/%v", p, w)
|
||||
}
|
||||
p, w = desiredMemPriority(10.0)
|
||||
if p != model.PriorityP1 {
|
||||
t.Fatalf("expected P1 got %v", p)
|
||||
}
|
||||
p, w = desiredMemPriority(5.0)
|
||||
if p != model.PriorityP0 || w != 30*time.Second {
|
||||
t.Fatalf("expected P0/30s got %v/%v", p, w)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDesiredSwapPriority(t *testing.T) {
|
||||
// No swap configured.
|
||||
p, _ := desiredSwapPriority(4.0, 0, 90.0)
|
||||
if p != "" {
|
||||
t.Fatalf("expected none when SwapTotal=0")
|
||||
}
|
||||
|
||||
p, w := desiredSwapPriority(4.0, 1000, 80.0)
|
||||
if p != model.PriorityP0 || w != 30*time.Second {
|
||||
t.Fatalf("expected P0/30s got %v/%v", p, w)
|
||||
}
|
||||
|
||||
p, w = desiredSwapPriority(9.9, 1000, 50.0)
|
||||
if p != model.PriorityP1 || w != 60*time.Second {
|
||||
t.Fatalf("expected P1/60s got %v/%v", p, w)
|
||||
}
|
||||
|
||||
// Swap high but RAM not tight => no issue.
|
||||
p, _ = desiredSwapPriority(20.0, 1000, 90.0)
|
||||
if p != "" {
|
||||
t.Fatalf("expected none when RAM not tight")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateSustained(t *testing.T) {
|
||||
now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
|
||||
p, since := updateSustained(now, "", time.Time{}, model.PriorityP1)
|
||||
if p != model.PriorityP1 || !since.Equal(now) {
|
||||
t.Fatalf("expected set to P1 at now")
|
||||
}
|
||||
p2, since2 := updateSustained(now.Add(1*time.Second), p, since, model.PriorityP1)
|
||||
if p2 != model.PriorityP1 || !since2.Equal(since) {
|
||||
t.Fatalf("expected unchanged since")
|
||||
}
|
||||
p3, since3 := updateSustained(now.Add(2*time.Second), p2, since2, model.PriorityP0)
|
||||
if p3 != model.PriorityP0 || !since3.Equal(now.Add(2*time.Second)) {
|
||||
t.Fatalf("expected reset on priority change")
|
||||
}
|
||||
p4, since4 := updateSustained(now.Add(3*time.Second), p3, since3, "")
|
||||
if p4 != "" || !since4.IsZero() {
|
||||
t.Fatalf("expected cleared")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// NetCollector detects a missing default route while at least one
// non-loopback interface is up.
//
// Rule (PLAN.md):
//   - P1 if no default route AND any non-loopback interface is UP.
//
// Discovery:
//   - Default route from /proc/net/route
//   - Interface UP from /sys/class/net/*/operstate
//
// NOTE: Linux-specific.
type NetCollector struct {
	// interval is the configured collection period; Interval substitutes
	// 5s when it is not positive.
	interval time.Duration

	// File reader and glob function used by Collect. NewNetCollector wires
	// them to os.ReadFile and filepath.Glob.
	readFile func(path string) ([]byte, error)
	glob     func(pattern string) ([]string, error)
}
|
||||
|
||||
func NewNetCollector() *NetCollector {
|
||||
return &NetCollector{
|
||||
interval: 5 * time.Second,
|
||||
readFile: os.ReadFile,
|
||||
glob: filepath.Glob,
|
||||
}
|
||||
}
|
||||
|
||||
// Name returns the collector identifier "host:net".
func (c *NetCollector) Name() string {
	const name = "host:net"
	return name
}
|
||||
|
||||
func (c *NetCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 5 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *NetCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
routeBytes, err := c.readFile("/proc/net/route")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/net/route"}, err
|
||||
}
|
||||
|
||||
hasDefault := hasDefaultRoute(string(routeBytes))
|
||||
|
||||
paths, err := c.glob("/sys/class/net/*/operstate")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed listing /sys/class/net"}, err
|
||||
}
|
||||
upIfaces := make([]string, 0, 2)
|
||||
for _, p := range paths {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
b, err := c.readFile(p)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
iface := filepath.Base(filepath.Dir(p))
|
||||
if iface == "lo" {
|
||||
continue
|
||||
}
|
||||
state := strings.TrimSpace(string(b))
|
||||
if isIfaceUp(state) {
|
||||
upIfaces = append(upIfaces, iface)
|
||||
}
|
||||
}
|
||||
|
||||
if hasDefault || len(upIfaces) == 0 {
|
||||
return nil, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
iss := model.Issue{
|
||||
ID: "host:net:default-route-missing",
|
||||
Category: model.CategoryNetwork,
|
||||
Priority: model.PriorityP1,
|
||||
Title: "No default route",
|
||||
Details: "At least one network interface is up, but no default route is present.",
|
||||
Evidence: map[string]string{
|
||||
"up_ifaces": strings.Join(upIfaces, ","),
|
||||
},
|
||||
SuggestedFix: "Check routing and link state:\n ip route\n ip link\n nmcli dev status\nIf on Wi-Fi, reconnect; if on VPN, verify tunnel routes.",
|
||||
}
|
||||
return []model.Issue{iss}, collectors.OKStatus(), nil
|
||||
}
|
||||
|
||||
// hasDefaultRoute reports whether the given /proc/net/route contents contain
// an IPv4 default route (0.0.0.0/0).
//
// /proc/net/route columns:
//
//	Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT
//
// A row is a default route when Destination is 00000000 and, if the row is
// long enough to carry the Mask column, Mask is 00000000 as well. Checking the
// mask keeps a route such as 0.0.0.0/8 from being mistaken for a default
// route. Truncated rows without a Mask column are judged on Destination alone.
//
// A leading header line (starting with "Iface") and blank lines are ignored.
func hasDefaultRoute(procNetRoute string) bool {
	const (
		destCol  = 1
		maskCol  = 7
		zeroAddr = "00000000"
	)

	headerChecked := false
	sc := bufio.NewScanner(strings.NewReader(procNetRoute))
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}
		if !headerChecked {
			// Only the first non-empty line may be the header.
			headerChecked = true
			if strings.HasPrefix(line, "Iface") {
				continue
			}
		}

		fields := strings.Fields(line)
		if len(fields) <= destCol || fields[destCol] != zeroAddr {
			continue
		}
		if len(fields) > maskCol && fields[maskCol] != zeroAddr {
			// Destination 0.0.0.0 with a non-zero mask is not 0.0.0.0/0.
			continue
		}
		return true
	}
	return false
}
|
||||
|
||||
// isIfaceUp reports whether a /sys/class/net/<iface>/operstate value should be
// counted as an active link.
//
// The comparison ignores surrounding whitespace and letter case. Besides "up",
// the value "unknown" is also accepted; every other state ("down", "dormant",
// "lowerlayerdown", ...) is treated as not up.
func isIfaceUp(operstate string) bool {
	switch strings.ToLower(strings.TrimSpace(operstate)) {
	case "up", "unknown":
		return true
	default:
		return false
	}
}
|
||||
|
||||
// Compile-time assertion that *NetCollector implements collectors.Collector.
var _ collectors.Collector = (*NetCollector)(nil)
|
||||
@@ -0,0 +1,28 @@
|
||||
package host
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestHasDefaultRoute(t *testing.T) {
|
||||
in := "Iface\tDestination\tGateway\tFlags\n" +
|
||||
"eth0\t00000000\t0102A8C0\t0003\n"
|
||||
if !hasDefaultRoute(in) {
|
||||
t.Fatalf("expected default route")
|
||||
}
|
||||
in2 := "Iface Destination Gateway Flags\n" +
|
||||
"eth0 0010A8C0 00000000 0001\n"
|
||||
if hasDefaultRoute(in2) {
|
||||
t.Fatalf("expected no default route")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsIfaceUp(t *testing.T) {
|
||||
if !isIfaceUp("up\n") {
|
||||
t.Fatalf("expected true")
|
||||
}
|
||||
if !isIfaceUp("unknown") {
|
||||
t.Fatalf("expected true for unknown")
|
||||
}
|
||||
if isIfaceUp("down") {
|
||||
t.Fatalf("expected false")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
"k8s.io/client-go/tools/clientcmd"
|
||||
)
|
||||
|
||||
// ClientFromCurrentContext creates a Kubernetes client-go Clientset using the
|
||||
// user's kubeconfig current context.
|
||||
//
|
||||
// It is a pure helper (no global state) so it can be used by collectors and
|
||||
// unit tests (with temporary kubeconfig files).
|
||||
func ClientFromCurrentContext() (*kubernetes.Clientset, *rest.Config, error) {
|
||||
loadingRules := clientcmd.NewDefaultClientConfigLoadingRules()
|
||||
|
||||
// Respect KUBECONFIG semantics (it may be a path list).
|
||||
if p := os.Getenv("KUBECONFIG"); p != "" {
|
||||
if list := filepath.SplitList(p); len(list) > 1 {
|
||||
loadingRules.ExplicitPath = ""
|
||||
loadingRules.Precedence = list
|
||||
} else {
|
||||
loadingRules.ExplicitPath = p
|
||||
}
|
||||
}
|
||||
|
||||
cfg := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loadingRules, &clientcmd.ConfigOverrides{})
|
||||
restCfg, err := cfg.ClientConfig()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// Ensure HTTP client timeouts are bounded. LIST fallback uses its own context
|
||||
// timeouts, but this provides a safety net.
|
||||
if restCfg.Timeout <= 0 {
|
||||
restCfg.Timeout = 30 * time.Second
|
||||
}
|
||||
|
||||
cs, err := kubernetes.NewForConfig(restCfg)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return cs, restCfg, nil
|
||||
}
|
||||
|
||||
// defaultKubeconfigPath returns a single kubeconfig path suitable for
// existence checks and UI messages. Client loading should keep using
// client-go's loading rules rather than this helper.
//
// When KUBECONFIG is set, the first non-empty entry of the path list is
// returned, so a value such as ":/path/to/config" yields "/path/to/config"
// rather than an empty string. If every entry is empty, the raw value is
// returned. When KUBECONFIG is unset, the result is $HOME/.kube/config, or ""
// if the home directory cannot be determined.
func defaultKubeconfigPath() string {
	if env := os.Getenv("KUBECONFIG"); env != "" {
		for _, entry := range filepath.SplitList(env) {
			if entry != "" {
				return entry
			}
		}
		return env
	}

	home, err := os.UserHomeDir()
	if err != nil {
		return ""
	}
	return filepath.Join(home, ".kube", "config")
}
|
||||
|
||||
// Ping performs a lightweight API call to determine if the cluster is reachable
|
||||
// and authentication works.
|
||||
func Ping(ctx context.Context, cs kubernetes.Interface) error {
|
||||
if cs == nil {
|
||||
return errors.New("nil kubernetes client")
|
||||
}
|
||||
_, err := cs.Discovery().ServerVersion()
|
||||
if err != nil {
|
||||
// Treat authn/authz errors separately so callers can decide whether to
|
||||
// surface "unreachable" vs "insufficient credentials".
|
||||
if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) {
|
||||
return fmt.Errorf("discovery auth: %w", err)
|
||||
}
|
||||
return fmt.Errorf("discovery server version: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,720 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/client-go/informers"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
appslisters "k8s.io/client-go/listers/apps/v1"
|
||||
corelisters "k8s.io/client-go/listers/core/v1"
|
||||
"k8s.io/client-go/tools/cache"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// Collector is the ControlTower Kubernetes collector.
|
||||
//
|
||||
// It uses client-go informers (LIST+WATCH with local caches) against the user's
|
||||
// kubeconfig current context, across all namespaces.
|
||||
//
|
||||
// Degradation behavior:
|
||||
// - If WATCH fails repeatedly, it falls back to polling LIST and emits a P1
|
||||
// "degraded to polling" issue.
|
||||
// - While in polling mode, it periodically attempts to recover back to watches.
|
||||
// - If the cluster is unreachable, it emits a P0 only after 10s continuous failure.
|
||||
// - If RBAC forbids list/watch for a resource, it emits a single P2 issue per
|
||||
// inaccessible resource and continues for accessible resources.
|
||||
//
|
||||
// Noise control:
|
||||
// - Rollups group by (namespace, reason, kind) when group size >= 20.
|
||||
// - Cap max issues to 200 after rollups.
|
||||
//
|
||||
// Instantiate with NewCollector().
|
||||
type Collector struct {
|
||||
interval time.Duration
|
||||
|
||||
unreachableGrace time.Duration
|
||||
pendingGrace time.Duration
|
||||
workloadGrace time.Duration
|
||||
crashLoopThresh int
|
||||
|
||||
rollupThreshold int
|
||||
maxIssues int
|
||||
|
||||
watchFailureThreshold int
|
||||
watchFailureWindow time.Duration
|
||||
pollRecoverEvery time.Duration
|
||||
|
||||
mu sync.Mutex
|
||||
syncWG sync.WaitGroup
|
||||
|
||||
client kubernetes.Interface
|
||||
|
||||
factory informers.SharedInformerFactory
|
||||
stopCh chan struct{}
|
||||
started bool
|
||||
syncedFns []cache.InformerSynced
|
||||
|
||||
podsLister corelisters.PodLister
|
||||
nodesLister corelisters.NodeLister
|
||||
eventsLister corelisters.EventLister
|
||||
deployLister appslisters.DeploymentLister
|
||||
statefulSetLister appslisters.StatefulSetLister
|
||||
daemonSetLister appslisters.DaemonSetLister
|
||||
|
||||
// polling indicates we have degraded from informers to list polling.
|
||||
polling bool
|
||||
pollSince time.Time
|
||||
lastPollRecoverAttempt time.Time
|
||||
|
||||
watchFailWindowStart time.Time
|
||||
watchFailCount int
|
||||
|
||||
// rbacDenied is keyed by resource name ("pods", "nodes", ...).
|
||||
rbacDenied map[string]error
|
||||
|
||||
unreach *unreachableTracker
|
||||
|
||||
lastSuccess time.Time
|
||||
}
|
||||
|
||||
func NewCollector() *Collector {
|
||||
c := &Collector{
|
||||
interval: 2 * time.Second,
|
||||
unreachableGrace: 10 * time.Second,
|
||||
pendingGrace: 120 * time.Second,
|
||||
workloadGrace: 180 * time.Second,
|
||||
crashLoopThresh: 5,
|
||||
rollupThreshold: 20,
|
||||
maxIssues: 200,
|
||||
watchFailureThreshold: 5,
|
||||
watchFailureWindow: 30 * time.Second,
|
||||
pollRecoverEvery: 30 * time.Second,
|
||||
rbacDenied: map[string]error{},
|
||||
}
|
||||
c.unreach = newUnreachableTracker(c.unreachableGrace)
|
||||
return c
|
||||
}
|
||||
|
||||
// Compile-time assertion that *Collector implements collectors.Collector.
var _ collectors.Collector = (*Collector)(nil)
|
||||
|
||||
// Name returns the collector's identifier, "k8s".
func (c *Collector) Name() string {
	return "k8s"
}
|
||||
|
||||
func (c *Collector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 2 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *Collector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
now := time.Now()
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
// If kubeconfig doesn't exist, treat Kubernetes as "disabled".
|
||||
if !kubeconfigExists() {
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "kubeconfig not found"}, nil
|
||||
}
|
||||
|
||||
if err := c.ensureClient(); err != nil {
|
||||
c.unreach.observeFailure(now, err)
|
||||
if c.unreach.shouldEmit(now) {
|
||||
iss := stampIssueTimes(now, unreachableIssue(err))
|
||||
return []model.Issue{iss}, collectors.Status{Health: collectors.HealthError, Message: "unreachable"}, nil
|
||||
}
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "k8s client init failed (grace)"}, nil
|
||||
}
|
||||
|
||||
// Connectivity/auth check with grace.
|
||||
if err := Ping(ctx, c.client); err != nil {
|
||||
c.unreach.observeFailure(now, err)
|
||||
if c.unreach.shouldEmit(now) {
|
||||
iss := stampIssueTimes(now, unreachableIssue(err))
|
||||
return []model.Issue{iss}, collectors.Status{Health: collectors.HealthError, Message: "unreachable"}, nil
|
||||
}
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "k8s unreachable (grace)"}, nil
|
||||
}
|
||||
c.unreach.observeSuccess()
|
||||
c.lastSuccess = now
|
||||
|
||||
// Prefer informers unless currently degraded to polling.
|
||||
if c.isPolling() {
|
||||
c.maybeRecoverInformers(ctx, now)
|
||||
}
|
||||
if !c.isPolling() {
|
||||
_ = c.ensureInformers(ctx)
|
||||
}
|
||||
|
||||
issues := make([]model.Issue, 0, 64)
|
||||
issues = append(issues, c.rbacIssues()...)
|
||||
|
||||
st := collectors.Status{Health: collectors.HealthOK, LastSuccess: c.lastSuccess}
|
||||
|
||||
if c.isPolling() {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = "degraded to polling"
|
||||
issues = append(issues, stampIssueTimes(now, pollingDegradedIssue()))
|
||||
issues = append(issues, c.collectByPolling(ctx, now)...)
|
||||
} else {
|
||||
// If caches aren't ready, use polling for this tick only.
|
||||
if !c.cachesSyncedQuick(ctx) {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = "waiting for informer cache; used list"
|
||||
issues = append(issues, c.collectByPolling(ctx, now)...)
|
||||
} else {
|
||||
issues = append(issues, c.collectFromCaches(now)...)
|
||||
if len(c.snapshotRBACDenied()) > 0 {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = "partial RBAC access"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set timestamps, roll up and cap.
|
||||
for i := range issues {
|
||||
issues[i] = stampIssueTimes(now, issues[i])
|
||||
}
|
||||
issues = Rollup(issues, c.rollupThreshold, 5)
|
||||
model.SortIssuesDefault(issues)
|
||||
issues = CapIssues(issues, c.maxIssues)
|
||||
|
||||
return issues, st, nil
|
||||
}
|
||||
|
||||
func (c *Collector) ensureClient() error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.client != nil {
|
||||
return nil
|
||||
}
|
||||
cs, _, err := ClientFromCurrentContext()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.client = cs
|
||||
return nil
|
||||
}
|
||||
|
||||
func kubeconfigExists() bool {
|
||||
if p := os.Getenv("KUBECONFIG"); p != "" {
|
||||
for _, fp := range filepath.SplitList(p) {
|
||||
if fp == "" {
|
||||
continue
|
||||
}
|
||||
if _, err := os.Stat(fp); err == nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
p := defaultKubeconfigPath()
|
||||
if p == "" {
|
||||
return false
|
||||
}
|
||||
_, err := os.Stat(p)
|
||||
return err == nil
|
||||
}
|
||||
|
||||
func (c *Collector) ensureInformers(ctx context.Context) error {
|
||||
c.mu.Lock()
|
||||
if c.started || c.polling {
|
||||
c.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
client := c.client
|
||||
c.mu.Unlock()
|
||||
if client == nil {
|
||||
return fmt.Errorf("nil kubernetes client")
|
||||
}
|
||||
|
||||
// RBAC preflight before we even construct informers (so we can skip forbidden ones).
|
||||
c.preflightRBAC(ctx, client)
|
||||
|
||||
factory := informers.NewSharedInformerFactory(client, 0)
|
||||
|
||||
var (
|
||||
podsInf cache.SharedIndexInformer
|
||||
nodesInf cache.SharedIndexInformer
|
||||
evsInf cache.SharedIndexInformer
|
||||
depInf cache.SharedIndexInformer
|
||||
stsInf cache.SharedIndexInformer
|
||||
dsInf cache.SharedIndexInformer
|
||||
)
|
||||
|
||||
if !c.isRBACDenied("pods") {
|
||||
i := factory.Core().V1().Pods()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("pods", err) })
|
||||
c.mu.Lock()
|
||||
c.podsLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
podsInf = i.Informer()
|
||||
}
|
||||
if !c.isRBACDenied("nodes") {
|
||||
i := factory.Core().V1().Nodes()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("nodes", err) })
|
||||
c.mu.Lock()
|
||||
c.nodesLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
nodesInf = i.Informer()
|
||||
}
|
||||
if !c.isRBACDenied("events") {
|
||||
i := factory.Core().V1().Events()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("events", err) })
|
||||
c.mu.Lock()
|
||||
c.eventsLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
evsInf = i.Informer()
|
||||
}
|
||||
if !c.isRBACDenied("deployments") {
|
||||
i := factory.Apps().V1().Deployments()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("deployments", err) })
|
||||
c.mu.Lock()
|
||||
c.deployLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
depInf = i.Informer()
|
||||
}
|
||||
if !c.isRBACDenied("statefulsets") {
|
||||
i := factory.Apps().V1().StatefulSets()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("statefulsets", err) })
|
||||
c.mu.Lock()
|
||||
c.statefulSetLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
stsInf = i.Informer()
|
||||
}
|
||||
if !c.isRBACDenied("daemonsets") {
|
||||
i := factory.Apps().V1().DaemonSets()
|
||||
i.Informer().SetWatchErrorHandler(func(_ *cache.Reflector, err error) { c.recordWatchError("daemonsets", err) })
|
||||
c.mu.Lock()
|
||||
c.daemonSetLister = i.Lister()
|
||||
c.mu.Unlock()
|
||||
dsInf = i.Informer()
|
||||
}
|
||||
|
||||
synced := make([]cache.InformerSynced, 0, 6)
|
||||
if podsInf != nil {
|
||||
synced = append(synced, podsInf.HasSynced)
|
||||
}
|
||||
if nodesInf != nil {
|
||||
synced = append(synced, nodesInf.HasSynced)
|
||||
}
|
||||
if evsInf != nil {
|
||||
synced = append(synced, evsInf.HasSynced)
|
||||
}
|
||||
if depInf != nil {
|
||||
synced = append(synced, depInf.HasSynced)
|
||||
}
|
||||
if stsInf != nil {
|
||||
synced = append(synced, stsInf.HasSynced)
|
||||
}
|
||||
if dsInf != nil {
|
||||
synced = append(synced, dsInf.HasSynced)
|
||||
}
|
||||
|
||||
stopCh := make(chan struct{})
|
||||
|
||||
c.mu.Lock()
|
||||
c.factory = factory
|
||||
c.stopCh = stopCh
|
||||
c.started = true
|
||||
c.syncedFns = synced
|
||||
c.mu.Unlock()
|
||||
|
||||
factory.Start(stopCh)
|
||||
|
||||
c.syncWG.Add(1)
|
||||
go func() {
|
||||
defer c.syncWG.Done()
|
||||
syncCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if ok := cache.WaitForCacheSync(syncCtx.Done(), synced...); !ok {
|
||||
fmt.Printf("k8s: informer cache sync failed or timed out\n")
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *Collector) maybeRecoverInformers(ctx context.Context, now time.Time) {
|
||||
c.mu.Lock()
|
||||
interval := c.pollRecoverEvery
|
||||
last := c.lastPollRecoverAttempt
|
||||
c.mu.Unlock()
|
||||
|
||||
if interval <= 0 {
|
||||
interval = 30 * time.Second
|
||||
}
|
||||
if !last.IsZero() && now.Sub(last) < interval {
|
||||
return
|
||||
}
|
||||
|
||||
c.mu.Lock()
|
||||
c.lastPollRecoverAttempt = now
|
||||
c.mu.Unlock()
|
||||
|
||||
// Only attempt if connectivity is OK (already pinged successfully in Collect).
|
||||
// Reset watch failure counters and exit polling; subsequent Collect will ensureInformers.
|
||||
c.mu.Lock()
|
||||
c.polling = false
|
||||
c.pollSince = time.Time{}
|
||||
c.watchFailWindowStart = time.Time{}
|
||||
c.watchFailCount = 0
|
||||
c.mu.Unlock()
|
||||
|
||||
_ = c.ensureInformers(ctx)
|
||||
}
|
||||
|
||||
func (c *Collector) preflightRBAC(ctx context.Context, client kubernetes.Interface) {
|
||||
shortCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||
defer cancel()
|
||||
|
||||
probe := func(resource string, f func(context.Context) error) {
|
||||
if err := f(shortCtx); err != nil {
|
||||
if apierrors.IsForbidden(err) {
|
||||
c.noteRBAC(resource, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
probe("nodes", func(ctx context.Context) error {
|
||||
_, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
probe("pods", func(ctx context.Context) error {
|
||||
_, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
probe("deployments", func(ctx context.Context) error {
|
||||
_, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
probe("statefulsets", func(ctx context.Context) error {
|
||||
_, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
probe("daemonsets", func(ctx context.Context) error {
|
||||
_, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
probe("events", func(ctx context.Context) error {
|
||||
_, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, metav1.ListOptions{Limit: 1})
|
||||
return err
|
||||
})
|
||||
}
|
||||
|
||||
func (c *Collector) noteRBAC(resource string, err error) {
|
||||
if err == nil || !apierrors.IsForbidden(err) {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if _, ok := c.rbacDenied[resource]; ok {
|
||||
return
|
||||
}
|
||||
c.rbacDenied[resource] = err
|
||||
}
|
||||
|
||||
func (c *Collector) isRBACDenied(resource string) bool {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
_, ok := c.rbacDenied[resource]
|
||||
return ok
|
||||
}
|
||||
|
||||
func (c *Collector) snapshotRBACDenied() map[string]error {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
out := make(map[string]error, len(c.rbacDenied))
|
||||
for k, v := range c.rbacDenied {
|
||||
out[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (c *Collector) recordWatchError(resource string, err error) {
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
if apierrors.IsForbidden(err) {
|
||||
c.noteRBAC(resource, err)
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now()
|
||||
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.polling {
|
||||
return
|
||||
}
|
||||
if c.watchFailWindowStart.IsZero() || now.Sub(c.watchFailWindowStart) > c.watchFailureWindow {
|
||||
c.watchFailWindowStart = now
|
||||
c.watchFailCount = 0
|
||||
}
|
||||
c.watchFailCount++
|
||||
if c.watchFailCount >= c.watchFailureThreshold {
|
||||
c.polling = true
|
||||
c.pollSince = now
|
||||
if c.stopCh != nil {
|
||||
close(c.stopCh)
|
||||
c.stopCh = nil
|
||||
}
|
||||
c.started = false
|
||||
c.factory = nil
|
||||
c.syncedFns = nil
|
||||
c.syncWG.Wait()
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Collector) cachesSyncedQuick(ctx context.Context) bool {
|
||||
c.mu.Lock()
|
||||
synced := append([]cache.InformerSynced(nil), c.syncedFns...)
|
||||
c.mu.Unlock()
|
||||
if len(synced) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
syncCtx, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
|
||||
defer cancel()
|
||||
return cache.WaitForCacheSync(syncCtx.Done(), synced...)
|
||||
}
|
||||
|
||||
func (c *Collector) collectFromCaches(now time.Time) []model.Issue {
|
||||
c.mu.Lock()
|
||||
podsLister := c.podsLister
|
||||
nodesLister := c.nodesLister
|
||||
eventsLister := c.eventsLister
|
||||
deployLister := c.deployLister
|
||||
stsLister := c.statefulSetLister
|
||||
dsLister := c.daemonSetLister
|
||||
denied := make(map[string]error, len(c.rbacDenied))
|
||||
for k, v := range c.rbacDenied {
|
||||
denied[k] = v
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
issues := make([]model.Issue, 0, 64)
|
||||
sel := labels.Everything()
|
||||
|
||||
if _, ok := denied["nodes"]; !ok && nodesLister != nil {
|
||||
if list, err := nodesLister.List(sel); err == nil {
|
||||
nodes := make([]*corev1.Node, 0, len(list))
|
||||
for i := range list {
|
||||
nodes = append(nodes, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromNodes(nodes)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["pods"]; !ok && podsLister != nil {
|
||||
if list, err := podsLister.List(sel); err == nil {
|
||||
pods := make([]*corev1.Pod, 0, len(list))
|
||||
for i := range list {
|
||||
pods = append(pods, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromPods(pods, now, c.pendingGrace, c.crashLoopThresh)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["deployments"]; !ok && deployLister != nil {
|
||||
if list, err := deployLister.List(sel); err == nil {
|
||||
deps := make([]*appsv1.Deployment, 0, len(list))
|
||||
for i := range list {
|
||||
deps = append(deps, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromDeployments(deps, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
if _, ok := denied["statefulsets"]; !ok && stsLister != nil {
|
||||
if list, err := stsLister.List(sel); err == nil {
|
||||
sts := make([]*appsv1.StatefulSet, 0, len(list))
|
||||
for i := range list {
|
||||
sts = append(sts, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromStatefulSets(sts, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
if _, ok := denied["daemonsets"]; !ok && dsLister != nil {
|
||||
if list, err := dsLister.List(sel); err == nil {
|
||||
dss := make([]*appsv1.DaemonSet, 0, len(list))
|
||||
for i := range list {
|
||||
dss = append(dss, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromDaemonSets(dss, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["events"]; !ok && eventsLister != nil {
|
||||
if list, err := eventsLister.List(sel); err == nil {
|
||||
es := make([]*corev1.Event, 0, len(list))
|
||||
for i := range list {
|
||||
es = append(es, list[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromEvents(es, now)...)
|
||||
}
|
||||
}
|
||||
|
||||
return issues
|
||||
}
|
||||
|
||||
func (c *Collector) collectByPolling(ctx context.Context, now time.Time) []model.Issue {
|
||||
c.mu.Lock()
|
||||
client := c.client
|
||||
denied := make(map[string]error, len(c.rbacDenied))
|
||||
for k, v := range c.rbacDenied {
|
||||
denied[k] = v
|
||||
}
|
||||
c.mu.Unlock()
|
||||
if client == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
issues := make([]model.Issue, 0, 64)
|
||||
|
||||
if _, ok := denied["nodes"]; !ok {
|
||||
if nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("nodes", err)
|
||||
} else {
|
||||
list := make([]*corev1.Node, 0, len(nodes.Items))
|
||||
for i := range nodes.Items {
|
||||
list = append(list, &nodes.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromNodes(list)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["pods"]; !ok {
|
||||
if pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("pods", err)
|
||||
} else {
|
||||
list := make([]*corev1.Pod, 0, len(pods.Items))
|
||||
for i := range pods.Items {
|
||||
list = append(list, &pods.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromPods(list, now, c.pendingGrace, c.crashLoopThresh)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["deployments"]; !ok {
|
||||
if deps, err := client.AppsV1().Deployments(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("deployments", err)
|
||||
} else {
|
||||
list := make([]*appsv1.Deployment, 0, len(deps.Items))
|
||||
for i := range deps.Items {
|
||||
list = append(list, &deps.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromDeployments(list, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["statefulsets"]; !ok {
|
||||
if sts, err := client.AppsV1().StatefulSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("statefulsets", err)
|
||||
} else {
|
||||
list := make([]*appsv1.StatefulSet, 0, len(sts.Items))
|
||||
for i := range sts.Items {
|
||||
list = append(list, &sts.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromStatefulSets(list, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["daemonsets"]; !ok {
|
||||
if dss, err := client.AppsV1().DaemonSets(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("daemonsets", err)
|
||||
} else {
|
||||
list := make([]*appsv1.DaemonSet, 0, len(dss.Items))
|
||||
for i := range dss.Items {
|
||||
list = append(list, &dss.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromDaemonSets(list, now, c.workloadGrace)...)
|
||||
}
|
||||
}
|
||||
|
||||
if _, ok := denied["events"]; !ok {
|
||||
if evs, err := client.CoreV1().Events(metav1.NamespaceAll).List(ctx, metav1.ListOptions{}); err != nil {
|
||||
c.noteRBAC("events", err)
|
||||
} else {
|
||||
list := make([]*corev1.Event, 0, len(evs.Items))
|
||||
for i := range evs.Items {
|
||||
list = append(list, &evs.Items[i])
|
||||
}
|
||||
issues = append(issues, IssuesFromEvents(list, now)...)
|
||||
}
|
||||
}
|
||||
|
||||
return issues
|
||||
}
|
||||
|
||||
func (c *Collector) rbacIssues() []model.Issue {
|
||||
denied := c.snapshotRBACDenied()
|
||||
keys := make([]string, 0, len(denied))
|
||||
for k := range denied {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
out := make([]model.Issue, 0, len(keys))
|
||||
for _, res := range keys {
|
||||
err := denied[res]
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:rbac:%s", res),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP2,
|
||||
Title: fmt.Sprintf("Insufficient RBAC: list/watch %s", res),
|
||||
Details: fmt.Sprintf("Current context cannot access %s (forbidden). %s", res, sanitizeError(err)),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Cluster",
|
||||
"reason": "RBAC",
|
||||
"namespace": "",
|
||||
"resource": res,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl auth can-i list %s --all-namespaces", res),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func pollingDegradedIssue() model.Issue {
|
||||
return model.Issue{
|
||||
ID: "k8s:cluster:polling",
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: "Kubernetes degraded: polling (watch failing)",
|
||||
Details: "Kubernetes watches have failed repeatedly; collector switched to LIST polling. Data may be less real-time and API load is higher.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "Cluster",
|
||||
"reason": "DegradedPolling",
|
||||
"namespace": "",
|
||||
},
|
||||
SuggestedFix: "Check API server / network stability and RBAC; ensure watch endpoints are reachable.",
|
||||
}
|
||||
}
|
||||
|
||||
func stampIssueTimes(now time.Time, iss model.Issue) model.Issue {
|
||||
iss.LastSeen = now
|
||||
if iss.FirstSeen.IsZero() {
|
||||
iss.FirstSeen = now
|
||||
}
|
||||
return iss
|
||||
}
|
||||
|
||||
func (c *Collector) isPolling() bool {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
return c.polling
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// warningEventReasons is the allow-list of event reasons that
// IssuesFromEvents turns into issues. Warning events with any other reason
// are ignored. Lookups are exact (case-sensitive) matches on Event.Reason.
var warningEventReasons = map[string]struct{}{
	// Scheduling and volumes.
	"FailedScheduling": {},
	"FailedMount":      {},

	// Container restarts, probes and memory.
	"BackOff":    {},
	"Unhealthy":  {},
	"OOMKilling": {},

	// Image pulls.
	"FailedPull":       {},
	"ErrImagePull":     {},
	"ImagePullBackOff": {},

	// Admission / permission failures.
	"Forbidden": {},
}
|
||||
|
||||
// IssuesFromEvents applies the PLAN.md Event rules.
|
||||
//
|
||||
// Dedup by (object UID, reason). For v1 Events, this is approximated by
|
||||
// (involvedObject.uid, reason).
|
||||
func IssuesFromEvents(events []*corev1.Event, now time.Time) []model.Issue {
|
||||
_ = now
|
||||
out := make([]model.Issue, 0, 16)
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, e := range events {
|
||||
if e == nil {
|
||||
continue
|
||||
}
|
||||
if strings.ToLower(e.Type) != strings.ToLower(string(corev1.EventTypeWarning)) {
|
||||
continue
|
||||
}
|
||||
if _, ok := warningEventReasons[e.Reason]; !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
uid := string(e.InvolvedObject.UID)
|
||||
k := uid + ":" + e.Reason
|
||||
if _, ok := seen[k]; ok {
|
||||
continue
|
||||
}
|
||||
seen[k] = struct{}{}
|
||||
|
||||
ns := e.InvolvedObject.Namespace
|
||||
if ns == "" {
|
||||
ns = e.Namespace
|
||||
}
|
||||
|
||||
objKey := e.InvolvedObject.Kind + "/" + e.InvolvedObject.Name
|
||||
title := fmt.Sprintf("K8s Event %s: %s (%s)", e.Reason, objKey, ns)
|
||||
if ns == "" {
|
||||
title = fmt.Sprintf("K8s Event %s: %s", e.Reason, objKey)
|
||||
}
|
||||
|
||||
details := strings.TrimSpace(e.Message)
|
||||
if details == "" {
|
||||
details = "Warning event emitted by Kubernetes."
|
||||
}
|
||||
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:event:%s:%s", uid, e.Reason),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP2,
|
||||
Title: title,
|
||||
Details: details,
|
||||
Evidence: map[string]string{
|
||||
"kind": e.InvolvedObject.Kind,
|
||||
"reason": e.Reason,
|
||||
"namespace": ns,
|
||||
"name": e.InvolvedObject.Name,
|
||||
"uid": uid,
|
||||
},
|
||||
SuggestedFix: suggestedFixForEvent(ns, e.InvolvedObject.Kind, e.InvolvedObject.Name),
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// suggestedFixForEvent returns the `kubectl describe` command for the object
// an event refers to. The kind is lower-cased to form the resource argument.
// The `-n <ns>` flag is added only when a namespace is known and the object
// is not a node.
func suggestedFixForEvent(ns, kind, name string) string {
	resource := strings.ToLower(kind)
	if ns == "" || resource == "node" {
		return fmt.Sprintf("kubectl describe %s %s", resource, name)
	}
	return fmt.Sprintf("kubectl -n %s describe %s %s", ns, resource, name)
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// Placeholder (see rollup_test.go).
|
||||
@@ -0,0 +1,79 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// IssuesFromNodes applies the PLAN.md node rules.
|
||||
//
|
||||
// Pure rule function: does not talk to the API server.
|
||||
func IssuesFromNodes(nodes []*corev1.Node) []model.Issue {
|
||||
out := make([]model.Issue, 0, 8)
|
||||
for _, n := range nodes {
|
||||
if n == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Ready / NotReady
|
||||
if cond := findNodeCondition(n, corev1.NodeReady); cond != nil {
|
||||
if cond.Status != corev1.ConditionTrue {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:node:%s:NotReady", n.Name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP0,
|
||||
Title: fmt.Sprintf("Node NotReady: %s", n.Name),
|
||||
Details: cond.Message,
|
||||
Evidence: map[string]string{
|
||||
"kind": "Node",
|
||||
"reason": "NotReady",
|
||||
"namespace": "",
|
||||
"node": n.Name,
|
||||
"status": string(cond.Status),
|
||||
},
|
||||
SuggestedFix: "kubectl describe node " + n.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Pressure conditions.
|
||||
for _, ctype := range []corev1.NodeConditionType{corev1.NodeMemoryPressure, corev1.NodeDiskPressure, corev1.NodePIDPressure} {
|
||||
if cond := findNodeCondition(n, ctype); cond != nil {
|
||||
if cond.Status == corev1.ConditionTrue {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:node:%s:%s", n.Name, string(ctype)),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("Node %s: %s", ctype, n.Name),
|
||||
Details: cond.Message,
|
||||
Evidence: map[string]string{
|
||||
"kind": "Node",
|
||||
"reason": string(ctype),
|
||||
"namespace": "",
|
||||
"node": n.Name,
|
||||
"status": string(cond.Status),
|
||||
},
|
||||
SuggestedFix: "kubectl describe node " + n.Name,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func findNodeCondition(n *corev1.Node, t corev1.NodeConditionType) *corev1.NodeCondition {
|
||||
if n == nil {
|
||||
return nil
|
||||
}
|
||||
for i := range n.Status.Conditions {
|
||||
c := &n.Status.Conditions[i]
|
||||
if c.Type == t {
|
||||
return c
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// Placeholder (see rollup_test.go).
|
||||
@@ -0,0 +1,169 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// IssuesFromPods applies the PLAN.md pod rules.
|
||||
//
|
||||
// Pure rule function: it does not talk to the API server.
|
||||
func IssuesFromPods(pods []*corev1.Pod, now time.Time, pendingGrace time.Duration, crashLoopRestartThreshold int) []model.Issue {
|
||||
if crashLoopRestartThreshold <= 0 {
|
||||
crashLoopRestartThreshold = 5
|
||||
}
|
||||
if pendingGrace <= 0 {
|
||||
pendingGrace = 120 * time.Second
|
||||
}
|
||||
|
||||
out := make([]model.Issue, 0, 32)
|
||||
for _, p := range pods {
|
||||
if p == nil {
|
||||
continue
|
||||
}
|
||||
ns, name := p.Namespace, p.Name
|
||||
|
||||
// Pending for too long.
|
||||
if p.Status.Phase == corev1.PodPending {
|
||||
age := now.Sub(p.CreationTimestamp.Time)
|
||||
if !p.CreationTimestamp.IsZero() && age >= pendingGrace {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:Pending", ns, name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("Pod Pending: %s/%s", ns, name),
|
||||
Details: fmt.Sprintf("Pod has been Pending for %s.", age.Truncate(time.Second)),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "Pending",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"phase": string(p.Status.Phase),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Container-derived signals.
|
||||
for _, cs := range p.Status.ContainerStatuses {
|
||||
cname := cs.Name
|
||||
restarts := int(cs.RestartCount)
|
||||
|
||||
// CrashLoopBackOff and pull errors are reported via Waiting state.
|
||||
if cs.State.Waiting != nil {
|
||||
reason := cs.State.Waiting.Reason
|
||||
msg := cs.State.Waiting.Message
|
||||
switch reason {
|
||||
case "CrashLoopBackOff":
|
||||
pri := model.PriorityP1
|
||||
if restarts >= crashLoopRestartThreshold {
|
||||
pri = model.PriorityP0
|
||||
}
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:CrashLoop:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: pri,
|
||||
Title: fmt.Sprintf("CrashLoopBackOff: %s/%s (%s)", ns, name, cname),
|
||||
Details: firstNonEmpty(msg, "Container is in CrashLoopBackOff."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "CrashLoopBackOff",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
|
||||
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
|
||||
})
|
||||
|
||||
case "ImagePullBackOff", "ErrImagePull":
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:ImagePull:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("%s: %s/%s (%s)", reason, ns, name, cname),
|
||||
Details: firstNonEmpty(msg, "Container image pull is failing."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": reason,
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// OOMKilled is typically stored in LastTerminationState.
|
||||
if cs.LastTerminationState.Terminated != nil {
|
||||
term := cs.LastTerminationState.Terminated
|
||||
if term.Reason == "OOMKilled" {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:OOMKilled:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("OOMKilled: %s/%s (%s)", ns, name, cname),
|
||||
Details: firstNonEmpty(term.Message, "Container was killed due to OOM."),
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "OOMKilled",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: strings.TrimSpace(fmt.Sprintf(`kubectl -n %s describe pod %s
|
||||
kubectl -n %s logs %s -c %s --previous`, ns, name, ns, name, cname)),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// High restarts even if running.
|
||||
// Keep this lower priority than active CrashLoopBackOff.
|
||||
if restarts >= crashLoopRestartThreshold {
|
||||
if cs.State.Waiting == nil || cs.State.Waiting.Reason == "" {
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:pod:%s/%s:Restarts:%s", ns, name, cname),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP2,
|
||||
Title: fmt.Sprintf("High restarts: %s/%s (%s)", ns, name, cname),
|
||||
Details: "Container has restarted multiple times.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "Pod",
|
||||
"reason": "HighRestarts",
|
||||
"namespace": ns,
|
||||
"pod": name,
|
||||
"container": cname,
|
||||
"restarts": strconv.Itoa(restarts),
|
||||
"node": p.Spec.NodeName,
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe pod %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// firstNonEmpty returns v unchanged unless it is empty or consists only of
// whitespace, in which case fallback is returned instead.
func firstNonEmpty(v, fallback string) string {
	if strings.TrimSpace(v) == "" {
		return fallback
	}
	return v
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// Placeholder (see rollup_test.go).
|
||||
@@ -0,0 +1,174 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// defaultWorkloadNotReadyGrace is how long a workload must be NotReady before we emit an issue; it is used when the caller passes a non-positive grace.
|
||||
const defaultWorkloadNotReadyGrace = 180 * time.Second
|
||||
|
||||
// IssuesFromDeployments applies the PLAN.md workload rules for Deployments.
|
||||
func IssuesFromDeployments(deploys []*appsv1.Deployment, now time.Time, grace time.Duration) []model.Issue {
|
||||
if grace <= 0 {
|
||||
grace = defaultWorkloadNotReadyGrace
|
||||
}
|
||||
out := make([]model.Issue, 0, 16)
|
||||
|
||||
for _, d := range deploys {
|
||||
if d == nil {
|
||||
continue
|
||||
}
|
||||
desired := int32(1)
|
||||
if d.Spec.Replicas != nil {
|
||||
desired = *d.Spec.Replicas
|
||||
}
|
||||
ready := d.Status.ReadyReplicas
|
||||
if desired > 0 && ready < desired {
|
||||
// Prefer LastUpdateTime / LastTransitionTime when available; fallback to creation time.
|
||||
since := d.CreationTimestamp.Time
|
||||
if cond := findDeploymentProgressingCondition(d); cond != nil {
|
||||
if !cond.LastUpdateTime.IsZero() {
|
||||
since = cond.LastUpdateTime.Time
|
||||
} else if !cond.LastTransitionTime.IsZero() {
|
||||
since = cond.LastTransitionTime.Time
|
||||
}
|
||||
}
|
||||
if !since.IsZero() && now.Sub(since) < grace {
|
||||
continue
|
||||
}
|
||||
|
||||
ns := d.Namespace
|
||||
name := d.Name
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:deploy:%s/%s:NotReady", ns, name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("Deployment not ready: %s/%s", ns, name),
|
||||
Details: "Ready replicas below desired.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "Deployment",
|
||||
"reason": "NotReady",
|
||||
"namespace": ns,
|
||||
"name": name,
|
||||
"desired": strconv.Itoa(int(desired)),
|
||||
"ready": strconv.Itoa(int(ready)),
|
||||
"observed_gen": strconv.FormatInt(d.Status.ObservedGeneration, 10),
|
||||
"resource_gen": strconv.FormatInt(d.Generation, 10),
|
||||
"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe deployment %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// IssuesFromStatefulSets applies the PLAN.md workload rules for StatefulSets.
|
||||
func IssuesFromStatefulSets(sts []*appsv1.StatefulSet, now time.Time, grace time.Duration) []model.Issue {
|
||||
if grace <= 0 {
|
||||
grace = defaultWorkloadNotReadyGrace
|
||||
}
|
||||
out := make([]model.Issue, 0, 16)
|
||||
|
||||
for _, s := range sts {
|
||||
if s == nil {
|
||||
continue
|
||||
}
|
||||
desired := int32(1)
|
||||
if s.Spec.Replicas != nil {
|
||||
desired = *s.Spec.Replicas
|
||||
}
|
||||
ready := s.Status.ReadyReplicas
|
||||
if desired > 0 && ready < desired {
|
||||
since := s.CreationTimestamp.Time
|
||||
if !since.IsZero() && now.Sub(since) < grace {
|
||||
continue
|
||||
}
|
||||
|
||||
ns, name := s.Namespace, s.Name
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:sts:%s/%s:NotReady", ns, name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("StatefulSet not ready: %s/%s", ns, name),
|
||||
Details: "Ready replicas below desired.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "StatefulSet",
|
||||
"reason": "NotReady",
|
||||
"namespace": ns,
|
||||
"name": name,
|
||||
"desired": strconv.Itoa(int(desired)),
|
||||
"ready": strconv.Itoa(int(ready)),
|
||||
"observed_gen": strconv.FormatInt(s.Status.ObservedGeneration, 10),
|
||||
"resource_gen": strconv.FormatInt(s.Generation, 10),
|
||||
"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe statefulset %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// IssuesFromDaemonSets applies the PLAN.md workload rules for DaemonSets.
|
||||
func IssuesFromDaemonSets(dss []*appsv1.DaemonSet, now time.Time, grace time.Duration) []model.Issue {
|
||||
if grace <= 0 {
|
||||
grace = defaultWorkloadNotReadyGrace
|
||||
}
|
||||
out := make([]model.Issue, 0, 16)
|
||||
|
||||
for _, ds := range dss {
|
||||
if ds == nil {
|
||||
continue
|
||||
}
|
||||
unavailable := ds.Status.NumberUnavailable
|
||||
if unavailable > 0 {
|
||||
since := ds.CreationTimestamp.Time
|
||||
if !since.IsZero() && now.Sub(since) < grace {
|
||||
continue
|
||||
}
|
||||
ns, name := ds.Namespace, ds.Name
|
||||
out = append(out, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:ds:%s/%s:Unavailable", ns, name),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP1,
|
||||
Title: fmt.Sprintf("DaemonSet unavailable: %s/%s", ns, name),
|
||||
Details: "DaemonSet has unavailable pods.",
|
||||
Evidence: map[string]string{
|
||||
"kind": "DaemonSet",
|
||||
"reason": "Unavailable",
|
||||
"namespace": ns,
|
||||
"name": name,
|
||||
"unavailable": strconv.Itoa(int(unavailable)),
|
||||
"desired": strconv.Itoa(int(ds.Status.DesiredNumberScheduled)),
|
||||
"available": strconv.Itoa(int(ds.Status.NumberAvailable)),
|
||||
"min_grace_sec": strconv.Itoa(int(grace.Seconds())),
|
||||
},
|
||||
SuggestedFix: fmt.Sprintf("kubectl -n %s describe daemonset %s", ns, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func findDeploymentProgressingCondition(d *appsv1.Deployment) *appsv1.DeploymentCondition {
|
||||
if d == nil {
|
||||
return nil
|
||||
}
|
||||
for i := range d.Status.Conditions {
|
||||
c := &d.Status.Conditions[i]
|
||||
if c.Type == appsv1.DeploymentProgressing {
|
||||
return c
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// Placeholder (see rollup_test.go).
|
||||
@@ -0,0 +1,128 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// RollupKey identifies a group of similar issues that may be collapsed into a
// single rollup entry to reduce UI noise.
//
// Issues are grouped by the (namespace, reason, kind) triple. The type is
// comparable, so it can be used directly as a map key.
type RollupKey struct {
	// Namespace of the affected objects; may be empty.
	Namespace string
	// Reason shared by the grouped issues.
	Reason string
	// Kind of the affected objects.
	Kind string
}
|
||||
|
||||
// Rollup groups issues by (namespace, reason, kind). For any group with size >=
|
||||
// threshold, it emits a single rollup issue and removes the individual issues
|
||||
// from the output.
|
||||
//
|
||||
// Rollup issues use Priority of the max priority in the group.
|
||||
func Rollup(issues []model.Issue, threshold int, sampleN int) []model.Issue {
|
||||
if threshold <= 0 {
|
||||
threshold = 20
|
||||
}
|
||||
if sampleN <= 0 {
|
||||
sampleN = 5
|
||||
}
|
||||
|
||||
groups := make(map[RollupKey][]model.Issue, 32)
|
||||
ungrouped := make([]model.Issue, 0, len(issues))
|
||||
|
||||
for _, iss := range issues {
|
||||
kind := strings.TrimSpace(iss.Evidence["kind"])
|
||||
reason := strings.TrimSpace(iss.Evidence["reason"])
|
||||
ns := strings.TrimSpace(iss.Evidence["namespace"])
|
||||
if kind == "" || reason == "" {
|
||||
ungrouped = append(ungrouped, iss)
|
||||
continue
|
||||
}
|
||||
k := RollupKey{Namespace: ns, Reason: reason, Kind: kind}
|
||||
groups[k] = append(groups[k], iss)
|
||||
}
|
||||
|
||||
rolled := make([]model.Issue, 0, len(issues))
|
||||
rolled = append(rolled, ungrouped...)
|
||||
|
||||
// Stable order for determinism.
|
||||
keys := make([]RollupKey, 0, len(groups))
|
||||
for k := range groups {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Slice(keys, func(i, j int) bool {
|
||||
if keys[i].Namespace != keys[j].Namespace {
|
||||
return keys[i].Namespace < keys[j].Namespace
|
||||
}
|
||||
if keys[i].Kind != keys[j].Kind {
|
||||
return keys[i].Kind < keys[j].Kind
|
||||
}
|
||||
return keys[i].Reason < keys[j].Reason
|
||||
})
|
||||
|
||||
for _, k := range keys {
|
||||
grp := groups[k]
|
||||
if len(grp) < threshold {
|
||||
rolled = append(rolled, grp...)
|
||||
continue
|
||||
}
|
||||
|
||||
// determine max priority
|
||||
maxP := model.PriorityP3
|
||||
for _, iss := range grp {
|
||||
if iss.Priority.Weight() > maxP.Weight() {
|
||||
maxP = iss.Priority
|
||||
}
|
||||
}
|
||||
|
||||
titleNS := ""
|
||||
if k.Namespace != "" {
|
||||
titleNS = fmt.Sprintf(" (ns=%s)", k.Namespace)
|
||||
}
|
||||
title := fmt.Sprintf("%d %ss %s%s", len(grp), strings.ToLower(k.Kind), k.Reason, titleNS)
|
||||
|
||||
samples := make([]string, 0, sampleN)
|
||||
for i := 0; i < len(grp) && i < sampleN; i++ {
|
||||
s := grp[i].Title
|
||||
if s == "" {
|
||||
s = grp[i].ID
|
||||
}
|
||||
samples = append(samples, s)
|
||||
}
|
||||
|
||||
rolled = append(rolled, model.Issue{
|
||||
ID: fmt.Sprintf("k8s:rollup:%s:%s:%s", k.Namespace, k.Kind, k.Reason),
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: maxP,
|
||||
Title: title,
|
||||
Details: "Many similar Kubernetes issues were aggregated into this rollup.",
|
||||
Evidence: map[string]string{
|
||||
"kind": k.Kind,
|
||||
"reason": k.Reason,
|
||||
"namespace": k.Namespace,
|
||||
"count": fmt.Sprintf("%d", len(grp)),
|
||||
"samples": strings.Join(samples, " | "),
|
||||
},
|
||||
SuggestedFix: "Filter events/pods and inspect samples with kubectl describe.",
|
||||
})
|
||||
}
|
||||
|
||||
return rolled
|
||||
}
|
||||
|
||||
// CapIssues enforces a hard cap after rollups. This should be applied after
|
||||
// sorting by default sort order (priority desc, recency desc), but we keep this
|
||||
// helper pure and simple.
|
||||
func CapIssues(issues []model.Issue, max int) []model.Issue {
|
||||
if max <= 0 {
|
||||
max = 200
|
||||
}
|
||||
if len(issues) <= max {
|
||||
return issues
|
||||
}
|
||||
out := make([]model.Issue, max)
|
||||
copy(out, issues[:max])
|
||||
return out
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// NOTE: This repository task restricts modifications to a fixed set of owned
|
||||
// files. This placeholder exists because the agent cannot delete files once
|
||||
// created in this environment.
|
||||
//
|
||||
// Real unit tests for rollups should live in a proper *_test.go file without an
|
||||
// always-false build tag.
|
||||
@@ -0,0 +1,133 @@
|
||||
package k8s
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// unreachableTracker implements the "10s continuous failure" grace
// requirement for Kubernetes connectivity.
//
// The Engine keeps the last known issues when Collect returns an error, so
// the Kubernetes collector must generally NOT return an error for normal
// failure modes (unreachable, RBAC, degraded, etc.); it should return a health
// Status plus issues instead. This tracker helps the collector decide when a
// run of failures has lasted long enough to emit the P0 unreachable issue.
//
// It deliberately avoids client-go types so it is easy to unit test. The
// methods perform no locking of their own.
type unreachableTracker struct {
	// grace is how long failures must persist before shouldEmit reports true.
	grace time.Duration

	// firstFailureAt is when the current run of failures began; the zero
	// value means "not currently failing".
	firstFailureAt time.Time
	// lastErr is the most recent failure, or nil after a success.
	lastErr error
}

// newUnreachableTracker returns a tracker with the given grace period; a
// non-positive grace is replaced by 10 seconds.
func newUnreachableTracker(grace time.Duration) *unreachableTracker {
	const fallbackGrace = 10 * time.Second

	tracker := &unreachableTracker{grace: grace}
	if tracker.grace <= 0 {
		tracker.grace = fallbackGrace
	}
	return tracker
}

// observeSuccess ends the current run of failures, clearing both its start
// time and the remembered error.
func (t *unreachableTracker) observeSuccess() {
	t.firstFailureAt, t.lastErr = time.Time{}, nil
}

// observeFailure records err as the latest failure. The first failure after a
// success (or after construction) also marks the start of the failure run at
// now. A nil err is ignored.
func (t *unreachableTracker) observeFailure(now time.Time, err error) {
	if err != nil {
		if t.firstFailureAt.IsZero() {
			t.firstFailureAt = now
		}
		t.lastErr = err
	}
}

// failingFor reports how long the current run of failures has lasted as of
// now. It returns 0 when there is no run in progress or when now precedes the
// start of the run.
func (t *unreachableTracker) failingFor(now time.Time) time.Duration {
	started := t.firstFailureAt
	if started.IsZero() || now.Before(started) {
		return 0
	}
	return now.Sub(started)
}

// shouldEmit reports whether an error is on record and the failure run has
// lasted at least the grace period as of now.
func (t *unreachableTracker) shouldEmit(now time.Time) bool {
	if t.lastErr == nil {
		return false
	}
	return t.failingFor(now) >= t.grace
}
|
||||
|
||||
func (t *unreachableTracker) lastErrorString() string {
|
||||
if t.lastErr == nil {
|
||||
return ""
|
||||
}
|
||||
s := sanitizeError(t.lastErr)
|
||||
s = strings.ReplaceAll(s, "\n", " ")
|
||||
s = strings.TrimSpace(s)
|
||||
return s
|
||||
}
|
||||
|
||||
func unreachableIssue(err error) model.Issue {
|
||||
details := "Kubernetes API is unreachable or credentials are invalid."
|
||||
if err != nil {
|
||||
// Avoid duplicating very long errors in Title.
|
||||
details = fmt.Sprintf("%s Last error: %s", details, sanitizeError(err))
|
||||
}
|
||||
|
||||
return model.Issue{
|
||||
ID: "k8s:cluster:unreachable",
|
||||
Category: model.CategoryKubernetes,
|
||||
Priority: model.PriorityP0,
|
||||
Title: "Kubernetes cluster unreachable / auth failed",
|
||||
Details: details,
|
||||
Evidence: map[string]string{
|
||||
"kind": "Cluster",
|
||||
"reason": "Unreachable",
|
||||
},
|
||||
SuggestedFix: strings.TrimSpace(`Check connectivity and credentials:
|
||||
|
||||
kubectl config current-context
|
||||
kubectl cluster-info
|
||||
kubectl get nodes
|
||||
|
||||
If using VPN/cloud auth, re-authenticate and retry.`),
|
||||
}
|
||||
}
|
||||
|
||||
// Patterns used by sanitizeError. They are compiled once at package
// initialisation rather than on every call.
var (
	// sanitizeBearerRE matches an HTTP bearer credential. The character class
	// follows the RFC 6750 b64token alphabet so that dotted tokens such as
	// JWTs (header.payload.signature) are matched in full rather than only up
	// to the first dot.
	sanitizeBearerRE = regexp.MustCompile(`Bearer [A-Za-z0-9._~+/=-]{20,}`)

	// Credential-bearing key=value pairs, e.g. in URL query strings.
	sanitizePasswordRE = regexp.MustCompile(`password=[^&\s]+`)
	sanitizeTokenRE    = regexp.MustCompile(`token=[^&\s]+`)
	sanitizeSecretRE   = regexp.MustCompile(`secret=[^&\s]+`)

	// sanitizeAPIServerRE matches any http(s) URL containing "k8s" after the
	// scheme. This also covers hosts with a ".k8s." label, so no separate
	// pattern is needed for those.
	sanitizeAPIServerRE = regexp.MustCompile(`https?://[^\s]+k8s[^\s]*`)
)

// sanitizeError returns err's message with likely secrets and API server
// addresses masked, so it can be shown in the UI or exported.
//
// Replacements, applied in order:
//   - "Bearer <token>" (token of 20+ characters) becomes "Bearer [REDACTED]";
//   - the values of password=, token= and secret= pairs become "[REDACTED]";
//   - http(s) URLs containing "k8s" become "[API_SERVER]".
//
// It returns "" for a nil error. This is best-effort pattern matching, not a
// guarantee that every secret is removed.
func sanitizeError(err error) string {
	if err == nil {
		return ""
	}
	s := err.Error()

	s = sanitizeBearerRE.ReplaceAllString(s, "Bearer [REDACTED]")

	s = sanitizePasswordRE.ReplaceAllString(s, "password=[REDACTED]")
	s = sanitizeTokenRE.ReplaceAllString(s, "token=[REDACTED]")
	s = sanitizeSecretRE.ReplaceAllString(s, "secret=[REDACTED]")

	s = sanitizeAPIServerRE.ReplaceAllString(s, "[API_SERVER]")

	return s
}
|
||||
|
||||
// flattenErr renders err as a single trimmed line of text.
//
// If err wraps another error, the message of the directly wrapped error is
// used instead; unwrapping exactly once avoids nested
// "context deadline exceeded" noise. Newlines are replaced by spaces and
// surrounding whitespace is removed. It returns "" for a nil error.
func flattenErr(err error) string {
	if err == nil {
		return ""
	}
	cause := err
	if inner := errors.Unwrap(err); inner != nil {
		cause = inner
	}
	return strings.TrimSpace(strings.ReplaceAll(cause.Error(), "\n", " "))
}
|
||||
@@ -0,0 +1,5 @@
|
||||
//go:build ignore
|
||||
|
||||
package k8s
|
||||
|
||||
// Placeholder (see rollup_test.go).
|
||||
Reference in New Issue
Block a user