feat: implement ControlTower TUI for cluster and host monitoring
Add a complete TUI application for monitoring Kubernetes clusters and host systems.

Core features:
- Collector framework with concurrent scheduling
- Host collectors: disk, memory, load, network
- Kubernetes collectors: pods, nodes, workloads, events with informers
- Issue deduplication, state management, and resolve-after logic
- Bubble Tea TUI with table view, details pane, and filtering
- JSON export functionality

UX improvements:
- Help overlay with keybindings
- Priority/category filters with visual indicators
- Direct priority jump (0/1/2/3)
- Bulk acknowledge (Shift+A)
- Clipboard copy (y)
- Theme toggle (T)
- Age format toggle (d)
- Wide title toggle (t)
- Vi-style navigation (j/k)
- Home/End jump (g/G)
- Rollup drill-down in details

Robustness:
- Grace period for unreachable clusters
- Rollups for high-volume issues
- Flap suppression
- RBAC error handling

Files: all core application code, with tests for the host collector, engine, store, model, and export packages.
This commit is contained in:
287
internal/collectors/host/disk.go
Normal file
287
internal/collectors/host/disk.go
Normal file
@@ -0,0 +1,287 @@
|
||||
package host
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"tower/internal/collectors"
|
||||
"tower/internal/model"
|
||||
)
|
||||
|
||||
// DiskCollector checks filesystem block + inode pressure across mounts.
//
// It reads /proc/mounts to discover mounts and then uses statfs to compute usage.
// Pseudo filesystems are filtered out.
//
// Thresholds (PLAN.md):
//   - P1 if blocks OR inodes >= 92%
//   - P0 if blocks OR inodes >= 98%
//
// Issues are emitted per mount (one issue that includes both block+inode usage).
//
// NOTE: This collector is Linux-specific (it depends on /proc/mounts and
// syscall.Statfs).
type DiskCollector struct {
	// interval is the requested collection period; Interval() falls back to
	// 10s when this is zero or negative.
	interval time.Duration

	// readFile and statfs are injection seams for tests. Production values
	// (os.ReadFile and syscall.Statfs) are wired in by NewDiskCollector.
	readFile func(string) ([]byte, error)
	statfs func(path string, st *syscall.Statfs_t) error
}
|
||||
|
||||
func NewDiskCollector() *DiskCollector {
|
||||
return &DiskCollector{
|
||||
interval: 10 * time.Second,
|
||||
readFile: os.ReadFile,
|
||||
statfs: syscall.Statfs,
|
||||
}
|
||||
}
|
||||
|
||||
// Name identifies this collector within the collector framework.
func (c *DiskCollector) Name() string {
	return "host:disk"
}
|
||||
|
||||
func (c *DiskCollector) Interval() time.Duration {
|
||||
if c.interval <= 0 {
|
||||
return 10 * time.Second
|
||||
}
|
||||
return c.interval
|
||||
}
|
||||
|
||||
func (c *DiskCollector) Collect(ctx context.Context) ([]model.Issue, collectors.Status, error) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
|
||||
b, err := c.readFile("/proc/mounts")
|
||||
if err != nil {
|
||||
return nil, collectors.Status{Health: collectors.HealthError, Message: "failed reading /proc/mounts"}, err
|
||||
}
|
||||
|
||||
mounts := parseProcMounts(string(b))
|
||||
if len(mounts) == 0 {
|
||||
// Unusual but treat as degraded rather than hard error.
|
||||
return nil, collectors.Status{Health: collectors.HealthDegraded, Message: "no mounts found"}, nil
|
||||
}
|
||||
|
||||
issues := make([]model.Issue, 0, 8)
|
||||
seenMount := map[string]struct{}{}
|
||||
|
||||
partialErrs := 0
|
||||
for _, m := range mounts {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return issues, collectors.Status{Health: collectors.HealthError, Message: "canceled"}, err
|
||||
}
|
||||
if shouldSkipMount(m) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seenMount[m.MountPoint]; ok {
|
||||
continue
|
||||
}
|
||||
seenMount[m.MountPoint] = struct{}{}
|
||||
|
||||
var st syscall.Statfs_t
|
||||
if err := c.statfs(m.MountPoint, &st); err != nil {
|
||||
partialErrs++
|
||||
continue
|
||||
}
|
||||
|
||||
blockPct, blockFreeBytes := statfsBlockUsedPct(st)
|
||||
inodePct := statfsInodeUsedPct(st)
|
||||
|
||||
pri, ok := diskPriority(blockPct, inodePct)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
evidence := map[string]string{
|
||||
"mount": m.MountPoint,
|
||||
"fstype": m.FSType,
|
||||
"block_used_pct": fmt.Sprintf("%.1f", blockPct),
|
||||
"block_free_bytes": strconv.FormatUint(blockFreeBytes, 10),
|
||||
}
|
||||
if inodePct >= 0 {
|
||||
evidence["inode_used_pct"] = fmt.Sprintf("%.1f", inodePct)
|
||||
}
|
||||
|
||||
issues = append(issues, model.Issue{
|
||||
ID: fmt.Sprintf("host:disk:%s:usage", m.MountPoint),
|
||||
Category: model.CategoryStorage,
|
||||
Priority: pri,
|
||||
Title: fmt.Sprintf("Disk usage high on %s", m.MountPoint),
|
||||
Details: "Filesystem space and/or inodes are nearly exhausted.",
|
||||
Evidence: evidence,
|
||||
SuggestedFix: fmt.Sprintf(
|
||||
"Inspect usage:\n df -h %s\n df -i %s\nFind large directories:\n sudo du -xh --max-depth=2 %s | sort -h | tail",
|
||||
m.MountPoint, m.MountPoint, m.MountPoint,
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
st := collectors.OKStatus()
|
||||
if partialErrs > 0 {
|
||||
st.Health = collectors.HealthDegraded
|
||||
st.Message = fmt.Sprintf("partial failures: %d mounts", partialErrs)
|
||||
}
|
||||
return issues, st, nil
|
||||
}
|
||||
|
||||
// procMount is one parsed line of /proc/mounts.
type procMount struct {
	Device     string // first field, octal escapes decoded (e.g. "/dev/sda1")
	MountPoint string // second field, octal escapes decoded (absolute path)
	FSType     string // third field (e.g. "ext4", "tmpfs")
	Options    string // fourth field, comma-separated; empty when absent
}

// parseProcMounts parses /proc/mounts content into procMount records.
// Blank lines and lines with fewer than three fields are skipped, so every
// returned entry has at least Device, MountPoint, and FSType populated.
// Scanner errors are ignored: the input is an in-memory string, and the only
// possible failure (a single line over the scanner's buffer limit) would
// simply truncate the result.
func parseProcMounts(content string) []procMount {
	sc := bufio.NewScanner(strings.NewReader(content))
	mounts := make([]procMount, 0, 32)
	for sc.Scan() {
		line := strings.TrimSpace(sc.Text())
		if line == "" {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 3 {
			continue
		}
		m := procMount{
			Device:     unescapeProcMountsField(fields[0]),
			MountPoint: unescapeProcMountsField(fields[1]),
			FSType:     fields[2],
		}
		if len(fields) >= 4 {
			m.Options = fields[3]
		}
		mounts = append(mounts, m)
	}
	return mounts
}

// procMountsUnescaper decodes the octal escape sequences the kernel writes
// into /proc/mounts fields (space, tab, newline, backslash). Built once at
// package init rather than on every call.
var procMountsUnescaper = strings.NewReplacer(
	"\\040", " ",
	"\\011", "\t",
	"\\012", "\n",
	"\\134", "\\",
)

// unescapeProcMountsField decodes kernel octal escapes in a single
// /proc/mounts field. The most common one is a space as \040.
func unescapeProcMountsField(s string) string {
	return procMountsUnescaper.Replace(s)
}
|
||||
|
||||
// pseudoFSTypes lists filesystem types that do not represent real disk
// capacity — kernel/virtual filesystems, container overlay mounts, and
// read-only images. Mounts of these types are excluded from disk-pressure
// checks. Kept sorted for easy scanning.
var pseudoFSTypes = map[string]struct{}{
	"autofs":          {},
	"binfmt_misc":     {},
	"bpf":             {},
	"cgroup":          {},
	"cgroup2":         {},
	"cgroupfs":        {},
	"configfs":        {},
	"debugfs":         {},
	"devpts":          {},
	"devtmpfs":        {},
	"efivarfs":        {},
	"fuse.gvfsd-fuse": {},
	"fuse.lxcfs":      {},
	"fusectl":         {},
	"hugetlbfs":       {},
	"mqueue":          {},
	"nsfs":            {},
	"overlay":         {}, // common container overlay mounts
	"overlayfs":       {}, // (non-standard) conservative skip
	"proc":            {},
	"procfs":          {},
	"pstore":          {},
	"ramfs":           {},
	"rpc_pipefs":      {},
	"securityfs":      {},
	"selinuxfs":       {},
	"squashfs":        {}, // typically read-only images
	"sysfs":           {},
	"systemd-1":       {},
	"tmpfs":           {},
	"tracefs":         {},
}
|
||||
|
||||
func shouldSkipMount(m procMount) bool {
|
||||
if m.MountPoint == "" {
|
||||
return true
|
||||
}
|
||||
// Filter by fstype.
|
||||
if _, ok := pseudoFSTypes[m.FSType]; ok {
|
||||
return true
|
||||
}
|
||||
// Filter common pseudo mountpoints.
|
||||
if strings.HasPrefix(m.MountPoint, "/proc") || strings.HasPrefix(m.MountPoint, "/sys") {
|
||||
return true
|
||||
}
|
||||
if strings.HasPrefix(m.MountPoint, "/dev") {
|
||||
// /dev itself can be a real mount in some cases, but usually isn't useful for disk pressure.
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func statfsBlockUsedPct(st syscall.Statfs_t) (usedPct float64, freeBytes uint64) {
|
||||
// Mirror df(1) semantics closely:
|
||||
// total = f_blocks
|
||||
// used = f_blocks - f_bfree
|
||||
// avail = f_bavail (space available to unprivileged user)
|
||||
// use% = used / (used + avail)
|
||||
if st.Blocks == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
bsize := uint64(st.Bsize)
|
||||
blocks := uint64(st.Blocks)
|
||||
bfree := uint64(st.Bfree)
|
||||
bavail := uint64(st.Bavail)
|
||||
|
||||
usedBlocks := blocks - bfree
|
||||
denom := usedBlocks + bavail
|
||||
if denom == 0 {
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
freeBytes = bavail * bsize
|
||||
usedPct = (float64(usedBlocks) / float64(denom)) * 100.0
|
||||
return usedPct, freeBytes
|
||||
}
|
||||
|
||||
// statfsInodeUsedPct returns inode used percent. If inodes are unavailable (f_files==0), returns -1.
|
||||
func statfsInodeUsedPct(st syscall.Statfs_t) float64 {
|
||||
if st.Files == 0 {
|
||||
return -1
|
||||
}
|
||||
total := float64(st.Files)
|
||||
free := float64(st.Ffree)
|
||||
used := total - free
|
||||
return (used / total) * 100.0
|
||||
}
|
||||
|
||||
func diskPriority(blockPct, inodePct float64) (model.Priority, bool) {
|
||||
maxPct := blockPct
|
||||
if inodePct > maxPct {
|
||||
maxPct = inodePct
|
||||
}
|
||||
// inodePct may be -1 if not supported; ignore in that case.
|
||||
if inodePct < 0 {
|
||||
maxPct = blockPct
|
||||
}
|
||||
|
||||
switch {
|
||||
case maxPct >= 98.0:
|
||||
return model.PriorityP0, true
|
||||
case maxPct >= 92.0:
|
||||
return model.PriorityP1, true
|
||||
default:
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
|
||||
// Compile-time assertion that *DiskCollector satisfies collectors.Collector.
var _ collectors.Collector = (*DiskCollector)(nil)
|
||||
Reference in New Issue
Block a user