feat: complete agent monitoring - hook, UI, and backend filter
- Add event_type and framework filters to events query endpoint
- Add /agents SPA route to web-ui server
- Add Agents nav link and route in frontend
- Add agents page CSS (timeline, VM pills, stats panel)
- Build VM status strip, activity timeline, and real-time stats
- Add agentmon hook for OpenClaw (HOOK.md + handler.ts)
- Add docker-compose, Dockerfile, and supporting infra files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,353 @@
|
||||
package openclaw
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Thresholds and sizing figures used when evaluating collected metrics.
const (
	// Guest disk usage thresholds, in percent.
	guestDiskWarningPercent  = 80.0
	guestDiskCriticalPercent = 95.0

	// Guest memory usage thresholds, in percent.
	guestMemoryWarningPercent  = 80.0
	guestMemoryCriticalPercent = 95.0

	// hostDiskActualGB is a host-side disk figure in GB.
	// NOTE(review): not referenced by the code visible here — confirm its intended use.
	hostDiskActualGB = 32.0

	// Backup age thresholds, in hours.
	backupWarningAgeHours  = 25.0
	backupCriticalAgeHours = 48.0
)
|
||||
|
||||
func CollectHostMetrics(domain string) (HostMetrics, error) {
|
||||
metrics := HostMetrics{}
|
||||
|
||||
state, err := virshCmd("domstate", domain)
|
||||
if err != nil {
|
||||
return metrics, fmt.Errorf("failed to get VM state: %w", err)
|
||||
}
|
||||
metrics.State = strings.TrimSpace(state)
|
||||
|
||||
if metrics.State != "running" {
|
||||
metrics.VCPUs = 0
|
||||
metrics.MemoryKiB = 0
|
||||
metrics.Autostart = false
|
||||
metrics.Snapshots = 0
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
info, err := virshCmd("dominfo", domain)
|
||||
if err != nil {
|
||||
return metrics, fmt.Errorf("failed to get VM info: %w", err)
|
||||
}
|
||||
|
||||
if cpuStr := parseVirshInfo(info, "CPU(s)"); cpuStr != "" {
|
||||
if cpu, err := strconv.Atoi(strings.TrimSpace(cpuStr)); err == nil {
|
||||
metrics.VCPUs = cpu
|
||||
}
|
||||
}
|
||||
if memStr := parseVirshInfo(info, "Max memory"); memStr != "" {
|
||||
if mem, err := parseMemoryKiB(memStr); err == nil {
|
||||
metrics.MemoryKiB = mem
|
||||
}
|
||||
}
|
||||
|
||||
autostartInfo := parseVirshInfo(info, "Autostart")
|
||||
metrics.Autostart = strings.TrimSpace(autostartInfo) == "enable"
|
||||
|
||||
cpuTime, err := virshCmd("dominfo", domain)
|
||||
if err == nil {
|
||||
if cpuStr := parseVirshInfo(cpuTime, "CPU time"); cpuStr != "" {
|
||||
if cpu, err := parseCPUTimeNS(cpuStr); err == nil {
|
||||
metrics.CPUTime = cpu
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
snapshots, err := virshCmd("snapshot-list", domain, "--name")
|
||||
if err == nil {
|
||||
metrics.Snapshots = len(strings.Fields(strings.TrimSpace(snapshots)))
|
||||
}
|
||||
|
||||
diskPath, err := virshCmd("domblklist", domain)
|
||||
if err == nil {
|
||||
lines := strings.Split(diskPath, "\n")
|
||||
for _, line := range lines {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 && fields[1] != "" && fields[1] != "-" {
|
||||
diskActual, _, err := getDiskStats(fields[1])
|
||||
if err == nil {
|
||||
metrics.DiskActual = diskActual
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
func CollectGuestMetrics(host, user string) (*GuestMetrics, error) {
|
||||
metrics := &GuestMetrics{}
|
||||
|
||||
serviceStatus, err := sshCmd(host, user, "systemctl --user is-active openclaw-gateway.service")
|
||||
if err == nil {
|
||||
metrics.ServiceActive = strings.TrimSpace(serviceStatus) == "active"
|
||||
}
|
||||
|
||||
if metrics.ServiceActive {
|
||||
uptime, err := sshCmd(host, user, "systemctl --user show openclaw-gateway.service -p ActiveEnterTimestamp --value")
|
||||
if err == nil {
|
||||
uptimeTS := strings.TrimSpace(uptime)
|
||||
if ts, err := time.Parse("Mon 2006-01-02 15:04:05 MST", uptimeTS); err == nil {
|
||||
duration := time.Since(ts)
|
||||
metrics.ServiceUptime = duration.String()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
httpCode, err := sshCmd(host, user, "curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:18789/")
|
||||
if err == nil {
|
||||
if code, err := strconv.Atoi(strings.TrimSpace(httpCode)); err == nil {
|
||||
metrics.HTTPStatus = code
|
||||
}
|
||||
}
|
||||
|
||||
version, err := sshCmd(host, user, "ls -la ~/.local/share/pnpm/5/node_modules/openclaw | grep -oP 'openclaw@[0-9.]+' | head -1")
|
||||
if err == nil {
|
||||
metrics.Version = strings.TrimSpace(strings.TrimPrefix(version, "openclaw@"))
|
||||
if metrics.Version != "" {
|
||||
serviceVersion, err := sshCmd(host, user, "grep OPENCLAW_SERVICE_VERSION ~/.config/systemd/user/openclaw-gateway.service 2>/dev/null | head -1")
|
||||
if err == nil && strings.Contains(serviceVersion, metrics.Version) {
|
||||
metrics.VersionConsistent = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
memInfo, err := sshCmd(host, user, "free -b | grep '^Mem:'")
|
||||
if err == nil {
|
||||
fields := strings.Fields(memInfo)
|
||||
if len(fields) >= 3 {
|
||||
if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
||||
metrics.MemoryTotal = total
|
||||
}
|
||||
if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
|
||||
metrics.MemoryUsed = used
|
||||
}
|
||||
if metrics.MemoryTotal > 0 {
|
||||
metrics.MemoryPercent = float64(metrics.MemoryUsed) / float64(metrics.MemoryTotal) * 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
diskInfo, err := sshCmd(host, user, "df -B1 / | tail -1")
|
||||
if err == nil {
|
||||
fields := strings.Fields(diskInfo)
|
||||
if len(fields) >= 5 {
|
||||
if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
||||
metrics.DiskTotal = total
|
||||
}
|
||||
if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
|
||||
metrics.DiskUsed = used
|
||||
}
|
||||
if metrics.DiskTotal > 0 {
|
||||
metrics.DiskPercent = float64(metrics.DiskUsed) / float64(metrics.DiskTotal) * 100
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
loadAvg, err := sshCmd(host, user, "awk '{print $1}' /proc/loadavg")
|
||||
if err == nil {
|
||||
if load, err := strconv.ParseFloat(strings.TrimSpace(loadAvg), 64); err == nil {
|
||||
metrics.LoadAverage = load
|
||||
}
|
||||
}
|
||||
|
||||
swappiness, err := sshCmd(host, user, "cat /proc/sys/vm/swappiness")
|
||||
if err == nil {
|
||||
if swap, err := strconv.Atoi(strings.TrimSpace(swappiness)); err == nil {
|
||||
metrics.Swappiness = swap
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
func CollectBackupStatus(instanceName string) (*BackupStatus, error) {
|
||||
backupPath := "/home/will/lab/swarm/openclaw"
|
||||
fileInfo, err := exec.Command("stat", "-c", "%Y", backupPath).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get backup timestamp: %w", err)
|
||||
}
|
||||
|
||||
timestampStr := strings.TrimSpace(string(fileInfo))
|
||||
timestamp, err := strconv.ParseInt(timestampStr, 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse backup timestamp: %w", err)
|
||||
}
|
||||
|
||||
lastBackup := time.Unix(timestamp, 0)
|
||||
age := time.Since(lastBackup)
|
||||
ageHours := age.Hours()
|
||||
|
||||
return &BackupStatus{
|
||||
LastBackup: lastBackup.UTC().Format(time.RFC3339),
|
||||
AgeHours: ageHours,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func DetectIssues(metrics Metrics) Issues {
|
||||
issues := Issues{}
|
||||
|
||||
if metrics.Guest != nil {
|
||||
if metrics.Guest.DiskPercent > guestDiskCriticalPercent {
|
||||
issues.GuestDiskUsageHigh = true
|
||||
}
|
||||
if metrics.Guest.MemoryPercent > guestMemoryCriticalPercent {
|
||||
issues.GuestMemoryUsageHigh = true
|
||||
}
|
||||
if !metrics.Guest.ServiceActive {
|
||||
issues.GatewayDown = true
|
||||
}
|
||||
if metrics.Guest.HTTPStatus != 200 {
|
||||
issues.HTTPUnhealthy = true
|
||||
}
|
||||
if metrics.Guest.Version != "" && !metrics.Guest.VersionConsistent {
|
||||
issues.VersionMismatch = true
|
||||
}
|
||||
}
|
||||
|
||||
if metrics.Instance.Status == "active" && metrics.Host.State != "running" {
|
||||
issues.VMNotRunning = true
|
||||
}
|
||||
|
||||
if metrics.Backup != nil && metrics.Backup.AgeHours > backupCriticalAgeHours {
|
||||
issues.BackupStale = true
|
||||
}
|
||||
|
||||
return issues
|
||||
}
|
||||
|
||||
func LoadInstances(registryPath string) ([]Instance, error) {
|
||||
data, err := exec.Command("cat", registryPath).Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read registry: %w", err)
|
||||
}
|
||||
|
||||
var registry struct {
|
||||
Instances []map[string]any `json:"instances"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(data, ®istry); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse registry: %w", err)
|
||||
}
|
||||
|
||||
instances := make([]Instance, 0, len(registry.Instances))
|
||||
for _, rawInst := range registry.Instances {
|
||||
inst := Instance{}
|
||||
|
||||
if name, ok := rawInst["name"].(string); ok {
|
||||
inst.Name = name
|
||||
}
|
||||
if domain, ok := rawInst["domain"].(string); ok {
|
||||
inst.Domain = domain
|
||||
}
|
||||
if host, ok := rawInst["host"].(string); ok && host != "" {
|
||||
inst.Host = &host
|
||||
}
|
||||
if user, ok := rawInst["user"].(string); ok {
|
||||
inst.User = user
|
||||
}
|
||||
if status, ok := rawInst["status"].(string); ok {
|
||||
inst.Status = status
|
||||
}
|
||||
|
||||
inst.Additional = rawInst
|
||||
|
||||
instances = append(instances, inst)
|
||||
}
|
||||
|
||||
return instances, nil
|
||||
}
|
||||
|
||||
// virshCmd runs virsh against the qemu:///system connection with the given
// arguments and returns its combined stdout and stderr as a string, together
// with the error from running the command.
func virshCmd(args ...string) (string, error) {
	argv := make([]string, 0, len(args)+2)
	argv = append(argv, "-c", "qemu:///system")
	argv = append(argv, args...)

	out, err := exec.Command("virsh", argv...).CombinedOutput()
	return string(out), err
}
|
||||
|
||||
// sshCmd runs command on host as user through the ssh client and returns the
// combined stdout and stderr as a string, together with the error from running
// the command.
//
// The connection is non-interactive (BatchMode) with a 5 second connect
// timeout. Host key checking is disabled and no known_hosts file is kept.
func sshCmd(host, user, command string) (string, error) {
	args := []string{
		"-o", "ConnectTimeout=5",
		"-o", "BatchMode=yes",
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "LogLevel=ERROR",
		"-q",
		user + "@" + host,
		command,
	}

	out, err := exec.Command("ssh", args...).CombinedOutput()
	return string(out), err
}
|
||||
|
||||
// parseVirshInfo returns the value of the first "key: value" line in info
// whose line starts with key followed by a colon. Leading whitespace of the
// value is removed; trailing whitespace is kept. It returns "" when no line
// matches or when the matching line has an empty value.
//
// Lines are scanned one at a time so that an empty value can never pick up
// text from the following line.
func parseVirshInfo(info, key string) string {
	prefix := key + ":"
	for _, line := range strings.Split(info, "\n") {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimLeft(line[len(prefix):], " \t\r\f")
		}
	}
	return ""
}
|
||||
|
||||
// memoryKiBPattern matches a lower-cased size such as "4194304 kib", "2gb" or
// "512 b": a decimal number, optional whitespace, an optional k/m/g/t prefix
// and a "b" or "ib" suffix. It is compiled once rather than on every call.
var memoryKiBPattern = regexp.MustCompile(`^(\d+(?:\.\d+)?)\s*([kmgt]?)i?b$`)

// parseMemoryKiB converts a human-readable memory size (for example
// "4194304 KiB" or "2 GiB") into KiB. Matching is case-insensitive and all
// prefixes are treated as powers of 1024. A value without a prefix is a byte
// count and is divided by 1024. Fractional results are truncated.
//
// It returns an error when memStr does not look like a size.
func parseMemoryKiB(memStr string) (int64, error) {
	memStr = strings.TrimSpace(strings.ToLower(memStr))

	match := memoryKiBPattern.FindStringSubmatch(memStr)
	if match == nil {
		return 0, fmt.Errorf("invalid memory format: %s", memStr)
	}

	value, err := strconv.ParseFloat(match[1], 64)
	if err != nil {
		return 0, fmt.Errorf("failed to parse memory value: %w", err)
	}

	switch match[2] {
	case "":
		// Plain bytes: scale down to KiB.
		value /= 1024
	case "k":
		// Already KiB.
	case "m":
		value *= 1024
	case "g":
		value *= 1024 * 1024
	case "t":
		value *= 1024 * 1024 * 1024
	}

	return int64(value), nil
}
|
||||
|
||||
// parseCPUTimeNS converts a CPU time string into nanoseconds.
//
// Two layouts are accepted:
//   - a single seconds value with an optional "s" suffix, e.g. "12.5s"
//     (the form virsh dominfo prints — see the virsh manual);
//   - five or more whitespace-separated fields where fields 0, 2 and 4 are
//     hours, minutes and seconds, e.g. "1 h 2 m 3s".
//
// Any other field count, or a component that is not a number, yields an error.
func parseCPUTimeNS(cpuStr string) (int64, error) {
	parts := strings.Fields(cpuStr)

	if len(parts) == 1 {
		seconds, err := strconv.ParseFloat(strings.TrimSuffix(parts[0], "s"), 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time format: %w", err)
		}
		return int64(seconds * 1e9), nil
	}

	// Index 4 is read below, so at least five fields are required.
	if len(parts) < 5 {
		return 0, fmt.Errorf("invalid CPU time format")
	}

	hours, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 0, fmt.Errorf("invalid CPU time hours: %w", err)
	}
	minutes, err := strconv.ParseFloat(parts[2], 64)
	if err != nil {
		return 0, fmt.Errorf("invalid CPU time minutes: %w", err)
	}
	seconds, err := strconv.ParseFloat(strings.TrimSuffix(parts[4], "s"), 64)
	if err != nil {
		return 0, fmt.Errorf("invalid CPU time seconds: %w", err)
	}

	totalSeconds := hours*3600 + minutes*60 + seconds
	return int64(totalSeconds * 1e9), nil
}
|
||||
|
||||
// getDiskStats returns the number of bytes actually allocated on disk for the
// file at path, computed as the allocated block count (%b) multiplied by the
// size of each of those blocks (%B) as reported by stat(1).
//
// virtual is always 0; it is kept in the signature for existing callers.
// An error is returned when stat fails or its output cannot be parsed.
func getDiskStats(path string) (actual, virtual int64, err error) {
	out, err := exec.Command("stat", "-c", "%b %B", path).Output()
	if err != nil {
		return 0, 0, err
	}

	fields := strings.Fields(string(out))
	if len(fields) < 2 {
		return 0, 0, fmt.Errorf("invalid stat output")
	}

	blockCount, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block count: %w", err)
	}
	blockSize, err := strconv.ParseInt(fields[1], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block size: %w", err)
	}

	return blockCount * blockSize, 0, nil
}
|
||||
@@ -0,0 +1,64 @@
|
||||
package openclaw
|
||||
|
||||
import "time"
|
||||
|
||||
// Instance is one entry of the instance registry.
type Instance struct {
	// Name is the registry "name" of the instance.
	Name string `json:"name"`
	// Domain is the registry "domain"; it is the name passed to virsh.
	Domain string `json:"domain"`
	// Host is the registry "host"; nil when the entry has no non-empty host.
	Host *string `json:"host,omitempty"`
	// User is the registry "user".
	User string `json:"user"`
	// Status is the registry "status", e.g. "active".
	Status string `json:"status"`

	// Additional holds the complete raw registry object; it is excluded from
	// JSON encoding.
	Additional map[string]any `json:"-"`
}
|
||||
|
||||
// HostMetrics holds the hypervisor-side view of one VM.
type HostMetrics struct {
	// State is the domain state string, e.g. "running".
	State string `json:"state"`
	// VCPUs is the number of virtual CPUs.
	VCPUs int `json:"vcpus"`
	// MemoryKiB is the domain's maximum memory in KiB.
	MemoryKiB int64 `json:"memory_kib"`
	// Autostart reports whether the domain is set to start automatically.
	Autostart bool `json:"autostart"`
	// Snapshots is the number of snapshots of the domain.
	Snapshots int `json:"snapshots"`
	// DiskActual is the disk space in use on the host, in bytes.
	DiskActual int64 `json:"disk_actual_bytes"`
	// CPUTime is the CPU time consumed, in nanoseconds.
	CPUTime int64 `json:"cpu_time_ns"`
}
|
||||
|
||||
// GuestMetrics holds measurements taken from inside the guest.
type GuestMetrics struct {
	// ServiceActive reports whether the gateway service is active.
	ServiceActive bool `json:"service_active"`
	// ServiceUptime is how long the service has been active, as a duration string.
	ServiceUptime string `json:"service_uptime"`
	// HTTPStatus is the HTTP status code of the local gateway endpoint.
	HTTPStatus int `json:"http_status"`
	// Version is the detected openclaw version; empty when unknown.
	Version string `json:"version"`
	// VersionConsistent reports whether the service definition mentions Version.
	VersionConsistent bool `json:"version_consistent"`
	// MemoryTotal is the total memory in bytes.
	MemoryTotal int64 `json:"memory_total_bytes"`
	// MemoryUsed is the used memory in bytes.
	MemoryUsed int64 `json:"memory_used_bytes"`
	// MemoryPercent is MemoryUsed as a percentage of MemoryTotal.
	MemoryPercent float64 `json:"memory_percent"`
	// DiskTotal is the size of the root filesystem in bytes.
	DiskTotal int64 `json:"disk_total_bytes"`
	// DiskUsed is the used space on the root filesystem in bytes.
	DiskUsed int64 `json:"disk_used_bytes"`
	// DiskPercent is DiskUsed as a percentage of DiskTotal.
	DiskPercent float64 `json:"disk_percent"`
	// LoadAverage is the first value of /proc/loadavg.
	LoadAverage float64 `json:"load_average"`
	// Swappiness is the value of /proc/sys/vm/swappiness.
	Swappiness int `json:"swappiness"`
}
|
||||
|
||||
// BackupStatus describes how recent the latest backup is.
type BackupStatus struct {
	// LastBackup is the time of the last backup as an RFC 3339 string in UTC.
	LastBackup string `json:"last_backup"`
	// AgeHours is the time elapsed since LastBackup, in hours.
	AgeHours float64 `json:"age_hours"`
}
|
||||
|
||||
type Metrics struct {
|
||||
Instance Instance `json:"instance"`
|
||||
Host HostMetrics `json:"host"`
|
||||
Guest *GuestMetrics `json:"guest,omitempty"`
|
||||
Backup *BackupStatus `json:"backup,omitempty"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// Issues is the set of problem flags derived from a Metrics sample.
type Issues struct {
	// GuestDiskUsageHigh is set when guest disk usage exceeds its threshold.
	GuestDiskUsageHigh bool `json:"guest_disk_usage_high"`
	// GuestMemoryUsageHigh is set when guest memory usage exceeds its threshold.
	GuestMemoryUsageHigh bool `json:"guest_memory_usage_high"`
	// HostDiskUsageHigh flags high host disk usage.
	// NOTE(review): nothing in the visible code sets this — confirm.
	HostDiskUsageHigh bool `json:"host_disk_usage_high"`
	// GatewayDown is set when the gateway service is not active.
	GatewayDown bool `json:"gateway_down"`
	// HTTPUnhealthy is set when the gateway HTTP status is not 200.
	HTTPUnhealthy bool `json:"http_unhealthy"`
	// VersionMismatch is set when a version was detected but is not consistent.
	VersionMismatch bool `json:"version_mismatch"`
	// VMNotRunning is set when an active instance's VM is not running.
	VMNotRunning bool `json:"vm_not_running"`
	// BackupStale is set when the last backup is older than its threshold.
	BackupStale bool `json:"backup_stale"`
}
|
||||
Reference in New Issue
Block a user