// Package openclaw gathers health metrics for OpenClaw instances by shelling
// out to virsh, ssh, stat, curl and the aws CLI, and derives issue flags from
// the collected values.
package openclaw

import (
	"encoding/json"
	"fmt"
	"os"
	"os/exec"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// Alerting thresholds. Only the *Critical* values are consulted by
// DetectIssues in this file; the remaining constants are kept for any other
// code in the package that may reference them.
const (
	guestDiskWarningPercent    = 80.0
	guestDiskCriticalPercent   = 95.0
	guestMemoryWarningPercent  = 80.0
	guestMemoryCriticalPercent = 95.0
	hostDiskActualGB           = 32.0
	backupWarningAgeHours      = 25.0
	backupCriticalAgeHours     = 48.0
)

// memorySizePattern matches a lower-cased size such as "8388608 kib",
// "2 gb" or "512b": a decimal number, an optional k/m/g/t prefix, an optional
// "i", and a trailing "b". It is compiled once instead of on every call.
var memorySizePattern = regexp.MustCompile(`^(\d+(?:\.\d+)?)\s*([kmgt]?)i?b$`)

// CollectHostMetrics queries libvirt (through virsh) for the given domain.
//
// The domain state is always fetched. When the domain is not "running" the
// remaining fields are left at their zero values and no error is returned.
// For a running domain, a failing `dominfo` aborts with an error (the
// returned metrics still carry State); snapshot and disk lookups are
// best-effort and simply leave their fields untouched on failure.
func CollectHostMetrics(domain string) (HostMetrics, error) {
	metrics := HostMetrics{}

	state, err := virshCmd("domstate", domain)
	if err != nil {
		return metrics, fmt.Errorf("failed to get VM state: %w", err)
	}
	metrics.State = strings.TrimSpace(state)
	if metrics.State != "running" {
		// Every other field is already at its zero value.
		return metrics, nil
	}

	info, err := virshCmd("dominfo", domain)
	if err != nil {
		return metrics, fmt.Errorf("failed to get VM info: %w", err)
	}

	if cpuStr := parseVirshInfo(info, "CPU(s)"); cpuStr != "" {
		if cpu, err := strconv.Atoi(strings.TrimSpace(cpuStr)); err == nil {
			metrics.VCPUs = cpu
		}
	}
	if memStr := parseVirshInfo(info, "Max memory"); memStr != "" {
		if mem, err := parseMemoryKiB(memStr); err == nil {
			metrics.MemoryKiB = mem
		}
	}
	metrics.Autostart = strings.TrimSpace(parseVirshInfo(info, "Autostart")) == "enable"

	// "CPU time" is part of the same dominfo output, so the text fetched
	// above is reused instead of invoking virsh a second time.
	if cpuStr := parseVirshInfo(info, "CPU time"); cpuStr != "" {
		if cpu, err := parseCPUTimeNS(cpuStr); err == nil {
			metrics.CPUTime = cpu
		}
	}

	if snapshots, err := virshCmd("snapshot-list", domain, "--name"); err == nil {
		metrics.Snapshots = len(strings.Fields(snapshots))
	}

	if blkList, err := virshCmd("domblklist", domain); err == nil {
		for _, line := range strings.Split(blkList, "\n") {
			fields := strings.Fields(line)
			if len(fields) < 2 {
				continue
			}
			source := fields[1]
			// Only absolute paths are stat-able files; this skips the
			// "Target Source" header row and "-" (empty) sources.
			if !strings.HasPrefix(source, "/") {
				continue
			}
			// If several sources are stat-able, the last one wins.
			if diskActual, _, err := getDiskStats(source); err == nil {
				metrics.DiskActual = diskActual
			}
		}
	}

	return metrics, nil
}

// CollectGuestMetrics runs a series of commands on the guest over SSH and
// fills in whatever could be obtained. Every probe is best-effort: a failing
// command or unparsable output leaves the corresponding fields at their zero
// values. The returned error is always nil.
func CollectGuestMetrics(host, user string) (*GuestMetrics, error) {
	metrics := &GuestMetrics{}

	// Gateway service state. A non-zero exit from systemctl surfaces as an
	// error here, which leaves ServiceActive false.
	if out, err := sshCmd(host, user, "systemctl --user is-active openclaw-gateway.service"); err == nil {
		metrics.ServiceActive = strings.TrimSpace(out) == "active"
	}

	if metrics.ServiceActive {
		out, err := sshCmd(host, user, "systemctl --user show openclaw-gateway.service -p ActiveEnterTimestamp --value")
		if err == nil {
			// NOTE(review): time.Parse resolves zone abbreviations against the
			// local location only; an unknown abbreviation is parsed with a
			// zero offset, which would skew the uptime — confirm acceptable.
			if ts, err := time.Parse("Mon 2006-01-02 15:04:05 MST", strings.TrimSpace(out)); err == nil {
				metrics.ServiceUptime = time.Since(ts).String()
			}
		}
	}

	// HTTP status of the local gateway endpoint.
	if out, err := sshCmd(host, user, "curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:18789/"); err == nil {
		if code, err := strconv.Atoi(strings.TrimSpace(out)); err == nil {
			metrics.HTTPStatus = code
		}
	}

	// Installed version, and whether the systemd unit mentions the same one.
	if out, err := sshCmd(host, user, "ls -la ~/.local/share/pnpm/5/node_modules/openclaw | grep -oP 'openclaw@[0-9.]+' | head -1"); err == nil {
		// Trim whitespace first so the prefix is found even if the output
		// carries leading blanks or a newline.
		metrics.Version = strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(out), "openclaw@"))
		if metrics.Version != "" {
			serviceVersion, err := sshCmd(host, user, "grep OPENCLAW_SERVICE_VERSION ~/.config/systemd/user/openclaw-gateway.service 2>/dev/null | head -1")
			if err == nil && strings.Contains(serviceVersion, metrics.Version) {
				metrics.VersionConsistent = true
			}
		}
	}

	// Memory: columns 2 and 3 of the "Mem:" row of `free -b`.
	if out, err := sshCmd(host, user, "free -b | grep '^Mem:'"); err == nil {
		if fields := strings.Fields(out); len(fields) >= 3 {
			if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
				metrics.MemoryTotal = total
			}
			if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
				metrics.MemoryUsed = used
			}
			if metrics.MemoryTotal > 0 {
				metrics.MemoryPercent = float64(metrics.MemoryUsed) / float64(metrics.MemoryTotal) * 100
			}
		}
	}

	// Root filesystem: columns 2 and 3 of the last row of `df -B1 /`.
	if out, err := sshCmd(host, user, "df -B1 / | tail -1"); err == nil {
		if fields := strings.Fields(out); len(fields) >= 5 {
			if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
				metrics.DiskTotal = total
			}
			if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
				metrics.DiskUsed = used
			}
			if metrics.DiskTotal > 0 {
				metrics.DiskPercent = float64(metrics.DiskUsed) / float64(metrics.DiskTotal) * 100
			}
		}
	}

	// First field of /proc/loadavg.
	if out, err := sshCmd(host, user, "awk '{print $1}' /proc/loadavg"); err == nil {
		if load, err := strconv.ParseFloat(strings.TrimSpace(out), 64); err == nil {
			metrics.LoadAverage = load
		}
	}

	if out, err := sshCmd(host, user, "cat /proc/sys/vm/swappiness"); err == nil {
		if swap, err := strconv.Atoi(strings.TrimSpace(out)); err == nil {
			metrics.Swappiness = swap
		}
	}

	return metrics, nil
}

// CollectBackupStatus reports the modification time of the backup directory
// and its age in hours. It returns an error when the path cannot be stat-ed.
//
// NOTE(review): instanceName is currently unused and the path is hard-coded,
// so every instance reports the same backup status — confirm this is intended.
func CollectBackupStatus(instanceName string) (*BackupStatus, error) {
	backupPath := "/home/will/lab/swarm/openclaw"

	// os.Stat yields the same mtime as `stat -c %Y` without spawning a
	// process or parsing its output.
	info, err := os.Stat(backupPath)
	if err != nil {
		return nil, fmt.Errorf("failed to get backup timestamp: %w", err)
	}

	lastBackup := info.ModTime()
	return &BackupStatus{
		LastBackup: lastBackup.UTC().Format(time.RFC3339),
		AgeHours:   time.Since(lastBackup).Hours(),
	}, nil
}

// CollectMinIOMetrics probes the MinIO endpoint and lists the backup prefix.
//
// It returns (nil, nil) unless instanceName equals OPENCLAW_MINIO_INSTANCE
// (default "zap"). Endpoint, bucket and prefix come from the
// OPENCLAW_MINIO_ENDPOINT, OPENCLAW_MINIO_BUCKET and OPENCLAW_MINIO_PREFIX
// environment variables. Failures are reported through metrics.Error rather
// than the error return, which is always nil.
func CollectMinIOMetrics(instanceName string) (*MinIOMetrics, error) {
	targetInstance := envDefault("OPENCLAW_MINIO_INSTANCE", "zap")
	if instanceName != targetInstance {
		return nil, nil
	}

	endpoint := envDefault("OPENCLAW_MINIO_ENDPOINT", "http://192.168.153.253:9000")
	bucket := envDefault("OPENCLAW_MINIO_BUCKET", "zap")
	prefix := envDefault("OPENCLAW_MINIO_PREFIX", "backups")

	metrics := &MinIOMetrics{
		Endpoint: endpoint,
		Bucket:   bucket,
		Prefix:   prefix,
	}

	// Liveness probe.
	healthURL := strings.TrimRight(endpoint, "/") + "/minio/health/live"
	statusOutput, err := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "--connect-timeout", "5", healthURL).CombinedOutput()
	if err != nil {
		metrics.Error = fmt.Sprintf("health check failed: %v", err)
		return metrics, nil
	}
	if code, err := strconv.Atoi(strings.TrimSpace(string(statusOutput))); err == nil {
		metrics.HTTPStatus = code
		metrics.Reachable = code == 200
	}

	// Object listing. Output() keeps stderr out of the JSON payload; on
	// failure the captured stderr is available through *exec.ExitError.
	listOutput, err := exec.Command(
		"aws", "--endpoint-url", endpoint,
		"s3api", "list-objects-v2",
		"--bucket", bucket,
		"--prefix", strings.Trim(prefix, "/")+"/",
		"--output", "json",
	).Output()
	if err != nil {
		msg := ""
		if exitErr, ok := err.(*exec.ExitError); ok {
			msg = strings.TrimSpace(string(exitErr.Stderr))
		}
		if msg == "" {
			msg = strings.TrimSpace(string(listOutput))
		}
		if msg == "" {
			msg = err.Error()
		}
		metrics.Error = msg
		return metrics, nil
	}

	// A successful call with no output means there is nothing under the
	// prefix; report zero objects instead of a JSON decoding error.
	if strings.TrimSpace(string(listOutput)) == "" {
		return metrics, nil
	}

	var resp struct {
		Contents []struct {
			Key          string    `json:"Key"`
			Size         int64     `json:"Size"`
			LastModified time.Time `json:"LastModified"`
		} `json:"Contents"`
	}
	if err := json.Unmarshal(listOutput, &resp); err != nil {
		metrics.Error = fmt.Sprintf("invalid usage response: %v", err)
		return metrics, nil
	}

	metrics.ObjectCount = len(resp.Contents)
	for _, obj := range resp.Contents {
		metrics.TotalBytes += obj.Size
	}

	if len(resp.Contents) > 0 {
		// Newest object first.
		sort.Slice(resp.Contents, func(i, j int) bool {
			return resp.Contents[i].LastModified.After(resp.Contents[j].LastModified)
		})
		metrics.LatestKey = resp.Contents[0].Key
		metrics.LatestBackup = resp.Contents[0].LastModified.UTC().Format(time.RFC3339)
	}

	return metrics, nil
}

// DetectIssues derives boolean issue flags from collected metrics.
//
// Guest checks run only when metrics.Guest is non-nil. Disk, memory and
// backup age are compared against the *Critical* thresholds with a strict
// "greater than".
func DetectIssues(metrics Metrics) Issues {
	issues := Issues{}

	if guest := metrics.Guest; guest != nil {
		if guest.DiskPercent > guestDiskCriticalPercent {
			issues.GuestDiskUsageHigh = true
		}
		if guest.MemoryPercent > guestMemoryCriticalPercent {
			issues.GuestMemoryUsageHigh = true
		}
		if !guest.ServiceActive {
			issues.GatewayDown = true
		}
		if guest.HTTPStatus != 200 {
			issues.HTTPUnhealthy = true
		}
		// A mismatch is only reported when a version was detected at all.
		if guest.Version != "" && !guest.VersionConsistent {
			issues.VersionMismatch = true
		}
	}

	if metrics.Instance.Status == "active" && metrics.Host.State != "running" {
		issues.VMNotRunning = true
	}

	if metrics.Backup != nil && metrics.Backup.AgeHours > backupCriticalAgeHours {
		issues.BackupStale = true
	}

	return issues
}

// LoadInstances reads the JSON registry at registryPath and converts each
// entry of its "instances" array into an Instance. String fields that are
// missing or of another JSON type are left empty; Host is set only for a
// non-empty string. The raw entry is preserved in Instance.Additional.
func LoadInstances(registryPath string) ([]Instance, error) {
	// Read the file directly rather than spawning `cat`.
	data, err := os.ReadFile(registryPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read registry: %w", err)
	}

	var registry struct {
		Instances []map[string]any `json:"instances"`
	}
	if err := json.Unmarshal(data, &registry); err != nil {
		return nil, fmt.Errorf("failed to parse registry: %w", err)
	}

	instances := make([]Instance, 0, len(registry.Instances))
	for _, rawInst := range registry.Instances {
		inst := Instance{}
		if name, ok := rawInst["name"].(string); ok {
			inst.Name = name
		}
		if domain, ok := rawInst["domain"].(string); ok {
			inst.Domain = domain
		}
		if host, ok := rawInst["host"].(string); ok && host != "" {
			// host is scoped to this if-statement, so each instance gets
			// its own pointer target.
			inst.Host = &host
		}
		if user, ok := rawInst["user"].(string); ok {
			inst.User = user
		}
		if status, ok := rawInst["status"].(string); ok {
			inst.Status = status
		}
		inst.Additional = rawInst
		instances = append(instances, inst)
	}

	return instances, nil
}

// envDefault returns the value of the environment variable key, or def when
// the variable is unset or empty.
func envDefault(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}

// virshCmd runs `virsh -c qemu:///system <args...>` and returns its combined
// stdout and stderr together with the command's error.
func virshCmd(args ...string) (string, error) {
	cmd := exec.Command("virsh", append([]string{"-c", "qemu:///system"}, args...)...)
	output, err := cmd.CombinedOutput()
	return string(output), err
}

// sshCmd runs command on user@host through a non-interactive ssh invocation
// (batch mode, 5 second connect timeout, host key checking disabled) and
// returns its combined stdout and stderr together with the command's error.
func sshCmd(host, user, command string) (string, error) {
	cmd := exec.Command("ssh",
		"-o", "ConnectTimeout=5",
		"-o", "BatchMode=yes",
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "LogLevel=ERROR",
		"-q",
		fmt.Sprintf("%s@%s", user, host),
		command)
	output, err := cmd.CombinedOutput()
	return string(output), err
}

// parseVirshInfo returns the whitespace-trimmed value of the first line in
// info that starts with "key:". It returns "" when no such line exists or
// when the value is empty.
//
// Scanning line by line keeps an empty value from swallowing the following
// line and avoids compiling a regular expression per call.
func parseVirshInfo(info, key string) string {
	prefix := key + ":"
	for _, line := range strings.Split(info, "\n") {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimSpace(line[len(prefix):])
		}
	}
	return ""
}

// parseMemoryKiB converts a size such as "8388608 KiB", "2 GiB" or "512 b"
// into KiB. The unit is case-insensitive; k/m/g/t prefixes are powers of
// 1024 and a bare "b" denotes bytes. Fractional results are truncated.
func parseMemoryKiB(memStr string) (int64, error) {
	memStr = strings.TrimSpace(strings.ToLower(memStr))

	match := memorySizePattern.FindStringSubmatch(memStr)
	if match == nil {
		return 0, fmt.Errorf("invalid memory format: %s", memStr)
	}

	value, err := strconv.ParseFloat(match[1], 64)
	if err != nil {
		return 0, fmt.Errorf("failed to parse memory value: %w", err)
	}

	switch match[2] {
	case "":
		// Plain bytes: scale down to KiB.
		value /= 1024
	case "k":
		// Already KiB.
	case "m":
		value *= 1024
	case "g":
		value *= 1024 * 1024
	case "t":
		value *= 1024 * 1024 * 1024
	}

	return int64(value), nil
}

// parseCPUTimeNS converts a CPU time string into nanoseconds.
//
// Two layouts are accepted:
//   - a single token of seconds with an optional "s" suffix, e.g. "81.5s";
//   - five or more whitespace-separated tokens where tokens 0, 2 and 4 are
//     hours, minutes and seconds (the seconds token may end in "s").
//
// Anything else, or a token that is not a number, yields an error.
func parseCPUTimeNS(cpuStr string) (int64, error) {
	parts := strings.Fields(cpuStr)

	switch {
	case len(parts) == 1:
		seconds, err := strconv.ParseFloat(strings.TrimSuffix(parts[0], "s"), 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time %q: %w", cpuStr, err)
		}
		return int64(seconds * 1e9), nil

	case len(parts) >= 5:
		hours, err := strconv.ParseFloat(parts[0], 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time hours %q: %w", parts[0], err)
		}
		minutes, err := strconv.ParseFloat(parts[2], 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time minutes %q: %w", parts[2], err)
		}
		seconds, err := strconv.ParseFloat(strings.TrimSuffix(parts[4], "s"), 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time seconds %q: %w", parts[4], err)
		}
		totalSeconds := hours*3600 + minutes*60 + seconds
		return int64(totalSeconds * 1e9), nil
	}

	return 0, fmt.Errorf("invalid CPU time format")
}

// getDiskStats returns the number of bytes actually allocated on disk for
// path, computed from `stat` as allocated blocks (%b) times the size of each
// of those blocks (%B). The virtual size is not determined and is always 0.
func getDiskStats(path string) (actual, virtual int64, err error) {
	// "--" keeps a path that begins with '-' from being read as an option.
	out, err := exec.Command("stat", "-c", "%b %B", "--", path).Output()
	if err != nil {
		return 0, 0, err
	}

	fields := strings.Fields(string(out))
	if len(fields) < 2 {
		return 0, 0, fmt.Errorf("invalid stat output")
	}

	blockCount, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block count %q: %w", fields[0], err)
	}
	blockSize, err := strconv.ParseInt(fields[1], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block size %q: %w", fields[1], err)
	}

	return blockCount * blockSize, 0, nil
}