feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collectors, engine, store, model, and export packages.
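Every collector added below satisfies one small contract defined in internal/collectors, which is not part of this diff. The following is only a sketch of that contract, inferred from the call sites in the files that follow; the Health string type and the HealthOK constant are assumptions, since only HealthDegraded, HealthError, and OKStatus() actually appear in this commit.

// Sketch only: the real definitions live in internal/collectors and
// internal/model, which are not included in this diff.
package collectors

import (
	"context"
	"time"

	"tower/internal/model"
)

// Health is assumed to be a string-like enum; HealthDegraded and HealthError
// are used by the host collectors below, HealthOK is a guess.
type Health string

const (
	HealthOK       Health = "ok"
	HealthDegraded Health = "degraded"
	HealthError    Health = "error"
)

// Status is returned by every Collect call alongside the issues.
type Status struct {
	Health  Health
	Message string
}

// OKStatus is what the collectors below return on the happy path.
func OKStatus() Status { return Status{Health: HealthOK} }

// Collector is the interface each host collector asserts against
// (var _ collectors.Collector = (*DiskCollector)(nil), and so on).
type Collector interface {
	Name() string            // stable identifier, e.g. "host:disk"
	Interval() time.Duration // how often the engine should call Collect
	Collect(ctx context.Context) ([]model.Issue, Status, error)
}

The "collector framework with concurrent scheduling" mentioned above presumably runs each collector on its own ticker derived from Interval(); that code is not shown in this diff.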
internal/collectors/host/disk.go (new file, 287 lines)
@@ -0,0 +1,287 @@
package host

import (
	"bufio"
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"syscall"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// DiskCollector checks filesystem block + inode pressure across mounts.
//
// It reads /proc/mounts to discover mounts and then uses statfs to compute usage.
// Pseudo filesystems are filtered out.
//
// Thresholds (PLAN.md):
// - P1 if blocks OR inodes >= 92%
// - P0 if blocks OR inodes >= 98%
//
// Issues are emitted per mount (one issue that includes both block+inode usage).
//
// NOTE: This collector is Linux-specific.
type DiskCollector struct {
	interval time.Duration

	readFile func(string) ([]byte, error)
	statfs   func(path string, st *syscall.Statfs_t) error
}

func NewDiskCollector() *DiskCollector {
	return &DiskCollector{
		interval: 10 * time.Second,
		readFile: os.ReadFile,
		statfs:   syscall.Statfs,
	}
}

func (c *DiskCollector) Name() string { return "host:disk" }

func (c *DiskCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 10 * time.Second
	}
	return c.interval
}

func (c *DiskCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	b, err := c.readFile("/proc/mounts")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/mounts"}, err
	}

	mounts := parseProcMounts(string(b))
	if len(mounts) == 0 {
		// Unusual but treat as degraded rather than hard error.
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "no mounts found"}, nil
	}

	issues := make([]model.Issue, 0, 8)
	seenMount := map[string]struct{}{}

	partialErrs := 0
	for _, m := range mounts {
		if err := ctx.Err(); err != nil {
			return issues, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
		}
		if shouldSkipMount(m) {
			continue
		}
		if _, ok := seenMount[m.MountPoint]; ok {
			continue
		}
		seenMount[m.MountPoint] = struct{}{}

		var st syscall.Statfs_t
		if err := c.statfs(m.MountPoint, &st); err != nil {
			partialErrs++
			continue
		}

		blockPct, blockFreeBytes := statfsBlockUsedPct(st)
		inodePct := statfsInodeUsedPct(st)

		pri, ok := diskPriority(blockPct, inodePct)
		if !ok {
			continue
		}

		evidence := map[string]string{
			"mount":            m.MountPoint,
			"fstype":           m.FSType,
			"block_used_pct":   fmt.Sprintf("%.1f", blockPct),
			"block_free_bytes": strconv.FormatUint(blockFreeBytes, 10),
		}
		if inodePct >= 0 {
			evidence["inode_used_pct"] = fmt.Sprintf("%.1f", inodePct)
		}

		issues = append(issues, model.Issue{
			ID:       fmt.Sprintf("host:disk:%s:usage", m.MountPoint),
			Category: model.CategoryStorage,
			Priority: pri,
			Title:    fmt.Sprintf("Disk usage high on %s", m.MountPoint),
			Details:  "Filesystem space and/or inodes are nearly exhausted.",
			Evidence: evidence,
			SuggestedFix: fmt.Sprintf(
				"Inspect usage:\n df -h %s\n df -i %s\nFind large directories:\n sudo du -xh --max-depth=2 %s | sort -h | tail",
				m.MountPoint, m.MountPoint, m.MountPoint,
			),
		})
	}

	st := collectors.OKStatus()
	if partialErrs > 0 {
		st.Health = collectors.HealthDegraded
		st.Message = fmt.Sprintf("partial failures: %d mounts", partialErrs)
	}
	return issues, st, nil
}

type procMount struct {
	Device     string
	MountPoint string
	FSType     string
	Options    string
}

func parseProcMounts(content string) []procMount {
	s := bufio.NewScanner(strings.NewReader(content))
	out := make([]procMount, 0, 32)
	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 3 {
			continue
		}
		m := procMount{
			Device:     unescapeProcMountsField(fields[0]),
			MountPoint: unescapeProcMountsField(fields[1]),
			FSType:     fields[2],
		}
		if len(fields) >= 4 {
			m.Options = fields[3]
		}
		out = append(out, m)
	}
	return out
}

// /proc/mounts escapes special characters as octal sequences.
// The most common one is a space as \040.
func unescapeProcMountsField(s string) string {
	replacer := strings.NewReplacer(
		"\\040", " ",
		"\\011", "\t",
		"\\012", "\n",
		"\\134", "\\",
	)
	return replacer.Replace(s)
}

var pseudoFSTypes = map[string]struct{}{
	"proc":        {},
	"sysfs":       {},
	"tmpfs":       {},
	"devtmpfs":    {},
	"devpts":      {},
	"cgroup":      {},
	"cgroup2":     {},
	"pstore":      {},
	"securityfs":  {},
	"debugfs":     {},
	"tracefs":     {},
	"configfs":    {},
	"hugetlbfs":   {},
	"mqueue":      {},
	"rpc_pipefs":  {},
	"fusectl":     {},
	"binfmt_misc": {},
	"autofs":      {},
	"bpf":         {},
	"ramfs":       {},
	"nsfs":        {},
	"efivarfs":    {},
	"overlay":     {}, // common container overlay mounts

	"squashfs":  {}, // typically read-only images
	"selinuxfs": {},
	"systemd-1": {},
	"overlayfs": {}, // (non-standard) conservative skip

	"cgroupfs":        {},
	"procfs":          {},
	"fuse.lxcfs":      {},
	"fuse.gvfsd-fuse": {},
}

func shouldSkipMount(m procMount) bool {
	if m.MountPoint == "" {
		return true
	}
	// Filter by fstype.
	if _, ok := pseudoFSTypes[m.FSType]; ok {
		return true
	}
	// Filter common pseudo mountpoints.
	if strings.HasPrefix(m.MountPoint, "/proc") || strings.HasPrefix(m.MountPoint, "/sys") {
		return true
	}
	if strings.HasPrefix(m.MountPoint, "/dev") {
		// /dev itself can be a real mount in some cases, but usually isn't useful for disk pressure.
		return true
	}
	return false
}

func statfsBlockUsedPct(st syscall.Statfs_t) (usedPct float64, freeBytes uint64) {
	// Mirror df(1) semantics closely:
	//   total = f_blocks
	//   used  = f_blocks - f_bfree
	//   avail = f_bavail (space available to unprivileged user)
	//   use%  = used / (used + avail)
	if st.Blocks == 0 {
		return 0, 0
	}

	bsize := uint64(st.Bsize)
	blocks := uint64(st.Blocks)
	bfree := uint64(st.Bfree)
	bavail := uint64(st.Bavail)

	usedBlocks := blocks - bfree
	denom := usedBlocks + bavail
	if denom == 0 {
		return 0, 0
	}

	freeBytes = bavail * bsize
	usedPct = (float64(usedBlocks) / float64(denom)) * 100.0
	return usedPct, freeBytes
}

// statfsInodeUsedPct returns inode used percent. If inodes are unavailable (f_files==0), returns -1.
func statfsInodeUsedPct(st syscall.Statfs_t) float64 {
	if st.Files == 0 {
		return -1
	}
	total := float64(st.Files)
	free := float64(st.Ffree)
	used := total - free
	return (used / total) * 100.0
}

func diskPriority(blockPct, inodePct float64) (model.Priority, bool) {
	maxPct := blockPct
	if inodePct > maxPct {
		maxPct = inodePct
	}
	// inodePct may be -1 if not supported; ignore in that case.
	if inodePct < 0 {
		maxPct = blockPct
	}

	switch {
	case maxPct >= 98.0:
		return model.PriorityP0, true
	case maxPct >= 92.0:
		return model.PriorityP1, true
	default:
		return "", false
	}
}

var _ collectors.Collector = (*DiskCollector)(nil)
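The unexported readFile and statfs fields above exist so Collect can be exercised without touching real mounts. The tests below only cover the pure helpers; a sketch of a full-path test with injected dependencies (hypothetical, not part of this commit) could look like this:

// Hypothetical addition to disk_test.go (package host); assumes imports
// "context", "syscall", "testing", and "tower/internal/model".
func TestDiskCollector_CollectWithInjectedDeps(t *testing.T) {
	c := &DiskCollector{
		readFile: func(string) ([]byte, error) {
			// One real-looking mount; pseudo filesystems would be filtered out.
			return []byte("/dev/sda1 / ext4 rw 0 0\n"), nil
		},
		statfs: func(_ string, st *syscall.Statfs_t) error {
			st.Bsize, st.Blocks, st.Bfree, st.Bavail = 4096, 1000, 10, 10 // ~99% used
			return nil
		},
	}
	issues, _, err := c.Collect(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(issues) != 1 || issues[0].Priority != model.PriorityP0 {
		t.Fatalf("expected one P0 issue, got %+v", issues)
	}
}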
internal/collectors/host/disk_test.go (new file, 80 lines)
@@ -0,0 +1,80 @@
package host

import (
	"syscall"
	"testing"
)

func TestParseProcMounts_UnescapesAndParses(t *testing.T) {
	in := "dev1 / ext4 rw 0 0\n" +
		"dev2 /path\\040with\\040space xfs rw 0 0\n" +
		"badline\n"

	ms := parseProcMounts(in)
	if len(ms) != 2 {
		t.Fatalf("expected 2 mounts, got %d", len(ms))
	}
	if ms[0].MountPoint != "/" || ms[0].FSType != "ext4" {
		t.Fatalf("unexpected first mount: %+v", ms[0])
	}
	if ms[1].MountPoint != "/path with space" {
		t.Fatalf("expected unescaped mountpoint, got %q", ms[1].MountPoint)
	}
}

func TestShouldSkipMount_FiltersPseudo(t *testing.T) {
	cases := []procMount{
		{MountPoint: "/proc", FSType: "proc"},
		{MountPoint: "/sys", FSType: "sysfs"},
		{MountPoint: "/dev", FSType: "tmpfs"},
		{MountPoint: "/dev/shm", FSType: "tmpfs"},
	}
	for _, c := range cases {
		if !shouldSkipMount(c) {
			t.Fatalf("expected skip for %+v", c)
		}
	}
	if shouldSkipMount(procMount{MountPoint: "/home", FSType: "ext4"}) {
		t.Fatalf("did not expect skip for /home ext4")
	}
}

func TestDiskPriority(t *testing.T) {
	if p, ok := diskPriority(91.9, -1); ok {
		t.Fatalf("expected no issue, got %v", p)
	}
	if p, ok := diskPriority(92.0, -1); !ok || p != "P1" {
		t.Fatalf("expected P1 at 92%%, got %v ok=%v", p, ok)
	}
	if p, ok := diskPriority(97.9, 98.0); !ok || p != "P0" {
		t.Fatalf("expected P0 if either crosses 98%%, got %v ok=%v", p, ok)
	}
}

func TestStatfsCalculations(t *testing.T) {
	st := syscall.Statfs_t{}
	st.Bsize = 1
	st.Blocks = 100
	st.Bfree = 8
	st.Bavail = 8

	pct, free := statfsBlockUsedPct(st)
	if free != 8 {
		t.Fatalf("expected free=8 bytes, got %d", free)
	}
	if pct < 91.9 || pct > 92.1 {
		t.Fatalf("expected ~92%% used, got %f", pct)
	}

	st.Files = 100
	st.Ffree = 2
	ipct := statfsInodeUsedPct(st)
	if ipct < 97.9 || ipct > 98.1 {
		t.Fatalf("expected ~98%% inode used, got %f", ipct)
	}

	st.Files = 0
	if statfsInodeUsedPct(st) != -1 {
		t.Fatalf("expected -1 when inode info unavailable")
	}
}
internal/collectors/host/load.go (new file, 127 lines)
@@ -0,0 +1,127 @@
package host

import (
	"context"
	"fmt"
	"os"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// LoadCollector evaluates 1-minute load average normalized by logical CPU count.
//
// Thresholds (PLAN.md), normalized by CPU count:
// - P2 if load1/cpus >= 4.0 sustained 120s
// - P1 if load1/cpus >= 6.0 sustained 120s
//
// NOTE: Linux-specific.
// Thread-safe: Collect() can be called concurrently.
type LoadCollector struct {
	interval time.Duration

	now      func() time.Time
	readFile func(string) ([]byte, error)
	cpuCount func() int

	mu sync.Mutex

	pri   model.Priority
	since time.Time
}

func NewLoadCollector() *LoadCollector {
	return &LoadCollector{
		interval: 5 * time.Second,
		now:      time.Now,
		readFile: os.ReadFile,
		cpuCount: runtime.NumCPU,
	}
}

func (c *LoadCollector) Name() string { return "host:load" }

func (c *LoadCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 5 * time.Second
	}
	return c.interval
}

func (c *LoadCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	now := c.now()
	b, err := c.readFile("/proc/loadavg")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/loadavg"}, err
	}

	load1, err := parseProcLoadavgFirst(string(b))
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "bad /proc/loadavg"}, nil
	}

	cpus := c.cpuCount()
	if cpus <= 0 {
		cpus = 1
	}
	norm := load1 / float64(cpus)
	desired, window := desiredLoadPriority(norm)
	c.mu.Lock()
	c.pri, c.since = updateSustained(now, c.pri, c.since, desired)
	pri, since := c.pri, c.since
	c.mu.Unlock()

	if pri == "" || since.IsZero() || now.Sub(since) < window {
		return nil, collectors.OKStatus(), nil
	}

	iss := model.Issue{
		ID:       "host:load:high",
		Category: model.CategoryPerformance,
		Priority: pri,
		Title:    "High sustained system load",
		Details:  "The 1-minute load average is high relative to CPU count for a sustained period.",
		Evidence: map[string]string{
			"load1":            fmt.Sprintf("%.2f", load1),
			"cpus":             strconv.Itoa(cpus),
			"load1_per_cpu":    fmt.Sprintf("%.2f", norm),
			"sustained_window": window.String(),
		},
		SuggestedFix: "Investigate CPU hogs:\n top\n ps -eo pid,ppid,cmd,%cpu --sort=-%cpu | head\nIf I/O bound (high iowait), check disk/network.\n",
	}
	return []model.Issue{iss}, collectors.OKStatus(), nil
}

func parseProcLoadavgFirst(content string) (float64, error) {
	// /proc/loadavg format: "1.23 0.70 0.50 1/123 4567".
	fields := strings.Fields(content)
	if len(fields) < 1 {
		return 0, fmt.Errorf("missing fields")
	}
	v, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, err
	}
	return v, nil
}

func desiredLoadPriority(loadPerCPU float64) (model.Priority, time.Duration) {
	if loadPerCPU >= 6.0 {
		return model.PriorityP1, 120 * time.Second
	}
	if loadPerCPU >= 4.0 {
		return model.PriorityP2, 120 * time.Second
	}
	return "", 0
}

var _ collectors.Collector = (*LoadCollector)(nil)
internal/collectors/host/load_test.go (new file, 48 lines)
@@ -0,0 +1,48 @@
package host

import (
	"testing"
	"time"

	"tower/internal/model"
)

func TestParseProcLoadavgFirst(t *testing.T) {
	v, err := parseProcLoadavgFirst("1.23 0.70 0.50 1/123 4567\n")
	if err != nil {
		t.Fatalf("unexpected err: %v", err)
	}
	if v < 1.229 || v > 1.231 {
		t.Fatalf("expected 1.23, got %v", v)
	}
	if _, err := parseProcLoadavgFirst("\n"); err == nil {
		t.Fatalf("expected error")
	}
}

func TestDesiredLoadPriority(t *testing.T) {
	p, w := desiredLoadPriority(3.99)
	if p != "" || w != 0 {
		t.Fatalf("expected none")
	}
	p, w = desiredLoadPriority(4.0)
	if p != model.PriorityP2 || w != 120*time.Second {
		t.Fatalf("expected P2/120s")
	}
	p, w = desiredLoadPriority(6.0)
	if p != model.PriorityP1 || w != 120*time.Second {
		t.Fatalf("expected P1/120s")
	}
}

func TestUpdateSustainedWorksForLoadToo(t *testing.T) {
	now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	p, since := updateSustained(now, "", time.Time{}, model.PriorityP2)
	if p != model.PriorityP2 || !since.Equal(now) {
		t.Fatalf("expected set")
	}
	p2, since2 := updateSustained(now.Add(10*time.Second), p, since, model.PriorityP2)
	if p2 != model.PriorityP2 || !since2.Equal(since) {
		t.Fatalf("expected unchanged")
	}
}
internal/collectors/host/mem.go (new file, 205 lines)
@@ -0,0 +1,205 @@
package host

import (
	"bufio"
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// MemCollector checks MemAvailable and swap pressure from /proc/meminfo.
//
// Thresholds (PLAN.md):
// Memory (MemAvailable as % of MemTotal):
// - P2 if <= 15% sustained 60s
// - P1 if <= 10% sustained 60s
// - P0 if <= 5% sustained 30s
//
// Swap pressure (only if RAM is also tight):
// - P1 if swap used >= 50% AND MemAvailable <= 10% sustained 60s
// - P0 if swap used >= 80% AND MemAvailable <= 5% sustained 30s
//
// Emits up to two issues:
// - host:mem:available
// - host:mem:swap
//
// NOTE: Linux-specific.
// Thread-safe: Collect() can be called concurrently.
type MemCollector struct {
	interval time.Duration

	now      func() time.Time
	readFile func(string) ([]byte, error)

	mu sync.Mutex

	memPri   model.Priority
	memSince time.Time

	swapPri   model.Priority
	swapSince time.Time
}

func NewMemCollector() *MemCollector {
	return &MemCollector{
		interval: 5 * time.Second,
		now:      time.Now,
		readFile: os.ReadFile,
	}
}

func (c *MemCollector) Name() string { return "host:mem" }

func (c *MemCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 5 * time.Second
	}
	return c.interval
}

func (c *MemCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	now := c.now()
	b, err := c.readFile("/proc/meminfo")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/meminfo"}, err
	}

	mi := parseProcMeminfo(string(b))
	memTotalKB, okT := mi["MemTotal"]
	memAvailKB, okA := mi["MemAvailable"]
	if !okT || !okA || memTotalKB <= 0 {
		return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "missing MemTotal/MemAvailable"}, nil
	}

	memAvailPct := (float64(memAvailKB) / float64(memTotalKB)) * 100.0

	desiredMemPri, memWindow := desiredMemPriority(memAvailPct)
	c.mu.Lock()
	c.memPri, c.memSince = updateSustained(now, c.memPri, c.memSince, desiredMemPri)
	memPri, memSince := c.memPri, c.memSince
	c.mu.Unlock()

	issues := make([]model.Issue, 0, 2)
	if memPri != "" && !memSince.IsZero() && now.Sub(memSince) >= memWindow {
		issues = append(issues, model.Issue{
			ID:       "host:mem:available",
			Category: model.CategoryMemory,
			Priority: memPri,
			Title:    "Low available memory",
			Details:  "MemAvailable is low and has remained low for a sustained period.",
			Evidence: map[string]string{
				"mem_available_kb":  strconv.FormatInt(memAvailKB, 10),
				"mem_total_kb":      strconv.FormatInt(memTotalKB, 10),
				"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
			},
			SuggestedFix: "Identify memory hogs:\n free -h\n ps aux --sort=-rss | head\nConsider restarting runaway processes or adding RAM.",
		})
	}

	swapTotalKB, okST := mi["SwapTotal"]
	swapFreeKB, okSF := mi["SwapFree"]
	swapUsedPct := 0.0
	if okST && okSF && swapTotalKB > 0 {
		swapUsedKB := swapTotalKB - swapFreeKB
		swapUsedPct = (float64(swapUsedKB) / float64(swapTotalKB)) * 100.0
	}

	desiredSwapPri, swapWindow := desiredSwapPriority(memAvailPct, swapTotalKB, swapUsedPct)
	c.mu.Lock()
	c.swapPri, c.swapSince = updateSustained(now, c.swapPri, c.swapSince, desiredSwapPri)
	swapPri, swapSince := c.swapPri, c.swapSince
	c.mu.Unlock()
	if swapPri != "" && !swapSince.IsZero() && now.Sub(swapSince) >= swapWindow {
		issues = append(issues, model.Issue{
			ID:       "host:mem:swap",
			Category: model.CategoryMemory,
			Priority: swapPri,
			Title:    "High swap usage with low RAM",
			Details:  "Swap usage is high while available RAM is also low, indicating memory pressure.",
			Evidence: map[string]string{
				"swap_used_pct":     fmt.Sprintf("%.1f", swapUsedPct),
				"swap_total_kb":     strconv.FormatInt(swapTotalKB, 10),
				"mem_available_pct": fmt.Sprintf("%.1f", memAvailPct),
			},
			SuggestedFix: "Find swapping processes:\n vmstat 1\n smem -r 2>/dev/null || true\nConsider reducing memory usage or increasing RAM/swap.",
		})
	}

	return issues, collectors.OKStatus(), nil
}

func parseProcMeminfo(content string) map[string]int64 {
	out := map[string]int64{}
	s := bufio.NewScanner(strings.NewReader(content))
	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}
		// Example: "MemAvailable: 12345 kB"
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		key := strings.TrimSuffix(fields[0], ":")
		v, err := strconv.ParseInt(fields[1], 10, 64)
		if err != nil {
			continue
		}
		out[key] = v
	}
	return out
}

func desiredMemPriority(memAvailPct float64) (model.Priority, time.Duration) {
	switch {
	case memAvailPct <= 5.0:
		return model.PriorityP0, 30 * time.Second
	case memAvailPct <= 10.0:
		return model.PriorityP1, 60 * time.Second
	case memAvailPct <= 15.0:
		return model.PriorityP2, 60 * time.Second
	default:
		return "", 0
	}
}

func desiredSwapPriority(memAvailPct float64, swapTotalKB int64, swapUsedPct float64) (model.Priority, time.Duration) {
	if swapTotalKB <= 0 {
		return "", 0
	}
	// Only alert on swap when RAM is also tight.
	switch {
	case swapUsedPct >= 80.0 && memAvailPct <= 5.0:
		return model.PriorityP0, 30 * time.Second
	case swapUsedPct >= 50.0 && memAvailPct <= 10.0:
		return model.PriorityP1, 60 * time.Second
	default:
		return "", 0
	}
}

// updateSustained updates current severity and its since timestamp.
// If desired is empty, it clears the state.
func updateSustained(now time.Time, current model.Priority, since time.Time, desired model.Priority) (model.Priority, time.Time) {
	if desired == "" {
		return "", time.Time{}
	}
	if current != desired || since.IsZero() {
		return desired, now
	}
	return current, since
}

var _ collectors.Collector = (*MemCollector)(nil)
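One way to read the threshold/window pairing above: with the 5-second interval, desiredMemPriority plus updateSustained mean host:mem:available only fires once the same priority has been wanted continuously for the whole window (roughly 12 consecutive samples for the 60s windows), and the clock restarts whenever the desired priority changes or clears. A small illustrative helper, hypothetical and assumed to live in package host alongside the code above:

// sustainedFiresAt simulates repeated Collect ticks at the given interval and
// returns the tick index at which the memory issue would first fire, or -1.
func sustainedFiresAt(memAvailPct float64, interval time.Duration, ticks int) int {
	start := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	var pri model.Priority
	var since time.Time
	for i := 0; i < ticks; i++ {
		now := start.Add(time.Duration(i) * interval)
		desired, window := desiredMemPriority(memAvailPct)
		pri, since = updateSustained(now, pri, since, desired)
		if pri != "" && !since.IsZero() && now.Sub(since) >= window {
			return i // e.g. memAvailPct=9.0 (P1, 60s window) with 5s ticks fires at i=12
		}
	}
	return -1
}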
internal/collectors/host/mem_test.go (new file, 83 lines)
@@ -0,0 +1,83 @@
package host

import (
	"testing"
	"time"

	"tower/internal/model"
)

func TestParseProcMeminfo(t *testing.T) {
	in := "MemTotal: 8000000 kB\nMemAvailable: 800000 kB\nSwapTotal: 2000000 kB\nSwapFree: 500000 kB\n"
	m := parseProcMeminfo(in)
	if m["MemTotal"] != 8000000 {
		t.Fatalf("MemTotal mismatch: %d", m["MemTotal"])
	}
	if m["MemAvailable"] != 800000 {
		t.Fatalf("MemAvailable mismatch: %d", m["MemAvailable"])
	}
}

func TestDesiredMemPriority(t *testing.T) {
	p, w := desiredMemPriority(16.0)
	if p != "" || w != 0 {
		t.Fatalf("expected none")
	}

	p, w = desiredMemPriority(15.0)
	if p != model.PriorityP2 || w != 60*time.Second {
		t.Fatalf("expected P2/60s got %v/%v", p, w)
	}
	p, w = desiredMemPriority(10.0)
	if p != model.PriorityP1 {
		t.Fatalf("expected P1 got %v", p)
	}
	p, w = desiredMemPriority(5.0)
	if p != model.PriorityP0 || w != 30*time.Second {
		t.Fatalf("expected P0/30s got %v/%v", p, w)
	}
}

func TestDesiredSwapPriority(t *testing.T) {
	// No swap configured.
	p, _ := desiredSwapPriority(4.0, 0, 90.0)
	if p != "" {
		t.Fatalf("expected none when SwapTotal=0")
	}

	p, w := desiredSwapPriority(4.0, 1000, 80.0)
	if p != model.PriorityP0 || w != 30*time.Second {
		t.Fatalf("expected P0/30s got %v/%v", p, w)
	}

	p, w = desiredSwapPriority(9.9, 1000, 50.0)
	if p != model.PriorityP1 || w != 60*time.Second {
		t.Fatalf("expected P1/60s got %v/%v", p, w)
	}

	// Swap high but RAM not tight => no issue.
	p, _ = desiredSwapPriority(20.0, 1000, 90.0)
	if p != "" {
		t.Fatalf("expected none when RAM not tight")
	}
}

func TestUpdateSustained(t *testing.T) {
	now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)
	p, since := updateSustained(now, "", time.Time{}, model.PriorityP1)
	if p != model.PriorityP1 || !since.Equal(now) {
		t.Fatalf("expected set to P1 at now")
	}
	p2, since2 := updateSustained(now.Add(1*time.Second), p, since, model.PriorityP1)
	if p2 != model.PriorityP1 || !since2.Equal(since) {
		t.Fatalf("expected unchanged since")
	}
	p3, since3 := updateSustained(now.Add(2*time.Second), p2, since2, model.PriorityP0)
	if p3 != model.PriorityP0 || !since3.Equal(now.Add(2*time.Second)) {
		t.Fatalf("expected reset on priority change")
	}
	p4, since4 := updateSustained(now.Add(3*time.Second), p3, since3, "")
	if p4 != "" || !since4.IsZero() {
		t.Fatalf("expected cleared")
	}
}
internal/collectors/host/net.go (new file, 138 lines)
@@ -0,0 +1,138 @@
package host

import (
	"bufio"
	"context"
	"os"
	"path/filepath"
	"strings"
	"time"

	"tower/internal/collectors"
	"tower/internal/model"
)

// NetCollector checks for missing default route while at least one non-loopback
// interface is up.
//
// Rule (PLAN.md):
// - P1 if no default route AND any non-loopback interface is UP.
//
// Discovery:
// - Default route from /proc/net/route
// - Interface UP from /sys/class/net/*/operstate
//
// NOTE: Linux-specific.
type NetCollector struct {
	interval time.Duration

	readFile func(string) ([]byte, error)
	glob     func(string) ([]string, error)
}

func NewNetCollector() *NetCollector {
	return &NetCollector{
		interval: 5 * time.Second,
		readFile: os.ReadFile,
		glob:     filepath.Glob,
	}
}

func (c *NetCollector) Name() string { return "host:net" }

func (c *NetCollector) Interval() time.Duration {
	if c.interval <= 0 {
		return 5 * time.Second
	}
	return c.interval
}

func (c *NetCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
	if err := ctx.Err(); err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
	}

	routeBytes, err := c.readFile("/proc/net/route")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/net/route"}, err
	}

	hasDefault := hasDefaultRoute(string(routeBytes))

	paths, err := c.glob("/sys/class/net/*/operstate")
	if err != nil {
		return nil, collectors.Status{Health: collectors.HealthError, Message: "failed listing /sys/class/net"}, err
	}
	upIfaces := make([]string, 0, 2)
	for _, p := range paths {
		if err := ctx.Err(); err != nil {
			return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
		}
		b, err := c.readFile(p)
		if err != nil {
			continue
		}
		iface := filepath.Base(filepath.Dir(p))
		if iface == "lo" {
			continue
		}
		state := strings.TrimSpace(string(b))
		if isIfaceUp(state) {
			upIfaces = append(upIfaces, iface)
		}
	}

	if hasDefault || len(upIfaces) == 0 {
		return nil, collectors.OKStatus(), nil
	}

	iss := model.Issue{
		ID:       "host:net:default-route-missing",
		Category: model.CategoryNetwork,
		Priority: model.PriorityP1,
		Title:    "No default route",
		Details:  "At least one network interface is up, but no default route is present.",
		Evidence: map[string]string{
			"up_ifaces": strings.Join(upIfaces, ","),
		},
		SuggestedFix: "Check routing and link state:\n ip route\n ip link\n nmcli dev status\nIf on Wi-Fi, reconnect; if on VPN, verify tunnel routes.",
	}
	return []model.Issue{iss}, collectors.OKStatus(), nil
}

func hasDefaultRoute(procNetRoute string) bool {
	// /proc/net/route header:
	// Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT
	// Default route has Destination == 00000000.
	s := bufio.NewScanner(strings.NewReader(procNetRoute))
	first := true
	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}
		if first {
			first = false
			// skip header if present
			if strings.HasPrefix(line, "Iface") {
				continue
			}
		}
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		if fields[1] == "00000000" {
			return true
		}
	}
	return false
}

func isIfaceUp(operstate string) bool {
	// Linux operstate values include: up, down, unknown, dormant, lowerlayerdown.
	s := strings.ToLower(strings.TrimSpace(operstate))
	return s == "up" || s == "unknown"
}

var _ collectors.Collector = (*NetCollector)(nil)
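A detail worth keeping in mind when reading hasDefaultRoute and the fixture in net_test.go below: /proc/net/route prints Destination and Gateway as 8-digit hex in host byte order (little-endian on x86 and common ARM), so 00000000 is 0.0.0.0 and the gateway 0102A8C0 in the test decodes to 192.168.2.1. A small decoding sketch, not part of this commit (needs the "net" and "strconv" imports):

// decodeRouteAddr turns a /proc/net/route hex field into a dotted-quad IP.
// Assumes host byte order is little-endian, as on x86 and little-endian ARM.
func decodeRouteAddr(hexAddr string) (net.IP, error) {
	raw, err := strconv.ParseUint(hexAddr, 16, 32)
	if err != nil {
		return nil, err
	}
	return net.IPv4(byte(raw), byte(raw>>8), byte(raw>>16), byte(raw>>24)), nil
}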
internal/collectors/host/net_test.go (new file, 28 lines)
@@ -0,0 +1,28 @@
package host

import "testing"

func TestHasDefaultRoute(t *testing.T) {
	in := "Iface\tDestination\tGateway\tFlags\n" +
		"eth0\t00000000\t0102A8C0\t0003\n"
	if !hasDefaultRoute(in) {
		t.Fatalf("expected default route")
	}
	in2 := "Iface Destination Gateway Flags\n" +
		"eth0 0010A8C0 00000000 0001\n"
	if hasDefaultRoute(in2) {
		t.Fatalf("expected no default route")
	}
}

func TestIsIfaceUp(t *testing.T) {
	if !isIfaceUp("up\n") {
		t.Fatalf("expected true")
	}
	if !isIfaceUp("unknown") {
		t.Fatalf("expected true for unknown")
	}
	if isIfaceUp("down") {
		t.Fatalf("expected false")
	}
}