Files
2026-03-26 11:22:45 -07:00

435 lines
12 KiB
Go

package openclaw
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"sort"
"regexp"
"strconv"
"strings"
"time"
)
// Guest resource-usage thresholds, expressed as percentages (0-100).
const (
	guestDiskWarningPercent    = 80.0
	guestDiskCriticalPercent   = 95.0
	guestMemoryWarningPercent  = 80.0
	guestMemoryCriticalPercent = 95.0
)

// hostDiskActualGB is a host disk size figure in GB.
// NOTE(review): not referenced by the code visible here — confirm its consumer.
const hostDiskActualGB = 32.0

// Backup age thresholds, in hours since the last backup.
const (
	backupWarningAgeHours  = 25.0
	backupCriticalAgeHours = 48.0
)
// CollectHostMetrics gathers hypervisor-side metrics for the libvirt domain
// named by domain by shelling out to virsh.
//
// An error is returned only when the domain state or the dominfo report cannot
// be fetched. When the domain is not "running", only State is populated and
// every other field keeps its zero value. Individual values that are missing
// or fail to parse are left at zero instead of failing the whole collection.
func CollectHostMetrics(domain string) (HostMetrics, error) {
	metrics := HostMetrics{}

	state, err := virshCmd("domstate", domain)
	if err != nil {
		return metrics, fmt.Errorf("failed to get VM state: %w", err)
	}
	metrics.State = strings.TrimSpace(state)
	if metrics.State != "running" {
		return metrics, nil
	}

	// A single dominfo report carries the vCPU count, memory, autostart flag
	// and CPU time, so it is fetched once and parsed for all four.
	info, err := virshCmd("dominfo", domain)
	if err != nil {
		return metrics, fmt.Errorf("failed to get VM info: %w", err)
	}
	if cpuStr := parseVirshInfo(info, "CPU(s)"); cpuStr != "" {
		if cpu, err := strconv.Atoi(strings.TrimSpace(cpuStr)); err == nil {
			metrics.VCPUs = cpu
		}
	}
	if memStr := parseVirshInfo(info, "Max memory"); memStr != "" {
		if mem, err := parseMemoryKiB(memStr); err == nil {
			metrics.MemoryKiB = mem
		}
	}
	metrics.Autostart = strings.TrimSpace(parseVirshInfo(info, "Autostart")) == "enable"
	if cpuStr := parseVirshInfo(info, "CPU time"); cpuStr != "" {
		if cpu, err := parseCPUTimeNS(cpuStr); err == nil {
			metrics.CPUTime = cpu
		}
	}

	// Count non-empty lines rather than whitespace-separated fields so a
	// snapshot name containing spaces is counted once.
	if snapshots, err := virshCmd("snapshot-list", domain, "--name"); err == nil {
		count := 0
		for _, line := range strings.Split(snapshots, "\n") {
			if strings.TrimSpace(line) != "" {
				count++
			}
		}
		metrics.Snapshots = count
	}

	if blockList, err := virshCmd("domblklist", domain); err == nil {
		for _, line := range strings.Split(blockList, "\n") {
			fields := strings.Fields(line)
			// Need a target and a source column; "-" marks a device with no source.
			if len(fields) < 2 || fields[1] == "-" {
				continue
			}
			// Skip the table header instead of trying to stat a path named "Source".
			if fields[0] == "Target" && fields[1] == "Source" {
				continue
			}
			// NOTE(review): with several disks the last readable one wins.
			if diskActual, _, err := getDiskStats(fields[1]); err == nil {
				metrics.DiskActual = diskActual
			}
		}
	}

	return metrics, nil
}
// CollectGuestMetrics probes the guest reachable as user@host over SSH and
// reports gateway service state, HTTP health, installed version, memory, disk,
// load average and swappiness.
//
// Every probe is best-effort: a command that fails or whose output cannot be
// parsed leaves the corresponding fields at their zero values. The returned
// error is always nil.
func CollectGuestMetrics(host, user string) (*GuestMetrics, error) {
	// run executes one command on the guest; the bool is false when sshCmd
	// reported an error.
	run := func(command string) (string, bool) {
		output, err := sshCmd(host, user, command)
		return output, err == nil
	}

	result := &GuestMetrics{}

	// Service state.
	if out, ok := run("systemctl --user is-active openclaw-gateway.service"); ok {
		result.ServiceActive = strings.TrimSpace(out) == "active"
	}

	// Service uptime, only meaningful while the service is active.
	if result.ServiceActive {
		if out, ok := run("systemctl --user show openclaw-gateway.service -p ActiveEnterTimestamp --value"); ok {
			startedAt, parseErr := time.Parse("Mon 2006-01-02 15:04:05 MST", strings.TrimSpace(out))
			if parseErr == nil {
				result.ServiceUptime = time.Since(startedAt).String()
			}
		}
	}

	// HTTP health of the local gateway endpoint.
	if out, ok := run("curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:18789/"); ok {
		if status, convErr := strconv.Atoi(strings.TrimSpace(out)); convErr == nil {
			result.HTTPStatus = status
		}
	}

	// Installed version, and whether the systemd unit file mentions the same one.
	if out, ok := run("ls -la ~/.local/share/pnpm/5/node_modules/openclaw | grep -oP 'openclaw@[0-9.]+' | head -1"); ok {
		result.Version = strings.TrimSpace(strings.TrimPrefix(out, "openclaw@"))
		if result.Version != "" {
			unitLine, unitOK := run("grep OPENCLAW_SERVICE_VERSION ~/.config/systemd/user/openclaw-gateway.service 2>/dev/null | head -1")
			if unitOK && strings.Contains(unitLine, result.Version) {
				result.VersionConsistent = true
			}
		}
	}

	// Memory: the second and third columns of the "Mem:" row are total and used bytes.
	if out, ok := run("free -b | grep '^Mem:'"); ok {
		if cols := strings.Fields(out); len(cols) >= 3 {
			if total, convErr := strconv.ParseInt(cols[1], 10, 64); convErr == nil {
				result.MemoryTotal = total
			}
			if used, convErr := strconv.ParseInt(cols[2], 10, 64); convErr == nil {
				result.MemoryUsed = used
			}
			if result.MemoryTotal > 0 {
				result.MemoryPercent = float64(result.MemoryUsed) / float64(result.MemoryTotal) * 100
			}
		}
	}

	// Root filesystem: the second and third columns of the df row are total and used bytes.
	if out, ok := run("df -B1 / | tail -1"); ok {
		if cols := strings.Fields(out); len(cols) >= 5 {
			if total, convErr := strconv.ParseInt(cols[1], 10, 64); convErr == nil {
				result.DiskTotal = total
			}
			if used, convErr := strconv.ParseInt(cols[2], 10, 64); convErr == nil {
				result.DiskUsed = used
			}
			if result.DiskTotal > 0 {
				result.DiskPercent = float64(result.DiskUsed) / float64(result.DiskTotal) * 100
			}
		}
	}

	// First field of /proc/loadavg.
	if out, ok := run("awk '{print $1}' /proc/loadavg"); ok {
		if load, convErr := strconv.ParseFloat(strings.TrimSpace(out), 64); convErr == nil {
			result.LoadAverage = load
		}
	}

	// Kernel swappiness setting.
	if out, ok := run("cat /proc/sys/vm/swappiness"); ok {
		if value, convErr := strconv.Atoi(strings.TrimSpace(out)); convErr == nil {
			result.Swappiness = value
		}
	}

	return result, nil
}
// CollectBackupStatus reports when the backup location was last modified and
// how many hours ago that was.
//
// The location defaults to /home/will/lab/swarm/openclaw and can be overridden
// with the OPENCLAW_BACKUP_PATH environment variable. instanceName is currently
// unused. An error is returned when the path cannot be inspected.
func CollectBackupStatus(instanceName string) (*BackupStatus, error) {
	backupPath := envDefault("OPENCLAW_BACKUP_PATH", "/home/will/lab/swarm/openclaw")

	// Lstat (not Stat) mirrors `stat` without -L, which does not follow a
	// final symlink.
	info, err := os.Lstat(backupPath)
	if err != nil {
		return nil, fmt.Errorf("failed to get backup timestamp: %w", err)
	}

	// Whole-second precision, matching the epoch-seconds value `stat -c %Y` gave.
	lastBackup := info.ModTime().Truncate(time.Second)

	return &BackupStatus{
		LastBackup: lastBackup.UTC().Format(time.RFC3339),
		AgeHours:   time.Since(lastBackup).Hours(),
	}, nil
}
// CollectMinIOMetrics checks the MinIO endpoint's liveness and summarizes the
// backup objects stored under the configured bucket/prefix.
//
// Only the instance named by OPENCLAW_MINIO_INSTANCE (default "zap") is
// inspected; for any other instanceName the function returns (nil, nil).
// Endpoint, bucket and prefix come from OPENCLAW_MINIO_ENDPOINT,
// OPENCLAW_MINIO_BUCKET and OPENCLAW_MINIO_PREFIX.
//
// Failures of the external tools are reported through metrics.Error rather
// than the returned error, which is always nil.
func CollectMinIOMetrics(instanceName string) (*MinIOMetrics, error) {
	targetInstance := envDefault("OPENCLAW_MINIO_INSTANCE", "zap")
	if instanceName != targetInstance {
		return nil, nil
	}

	endpoint := envDefault("OPENCLAW_MINIO_ENDPOINT", "http://192.168.153.253:9000")
	bucket := envDefault("OPENCLAW_MINIO_BUCKET", "zap")
	prefix := envDefault("OPENCLAW_MINIO_PREFIX", "backups")
	metrics := &MinIOMetrics{
		Endpoint: endpoint,
		Bucket:   bucket,
		Prefix:   prefix,
	}

	// Liveness probe: curl prints only the HTTP status code.
	healthURL := strings.TrimRight(endpoint, "/") + "/minio/health/live"
	statusOutput, err := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "--connect-timeout", "5", healthURL).CombinedOutput()
	if err != nil {
		metrics.Error = fmt.Sprintf("health check failed: %v", err)
		return metrics, nil
	}
	if code, err := strconv.Atoi(strings.TrimSpace(string(statusOutput))); err == nil {
		metrics.HTTPStatus = code
		metrics.Reachable = code == 200
	}

	// Capture stdout only: the JSON document must not be interleaved with
	// anything the CLI writes to stderr.
	listOutput, err := exec.Command(
		"aws",
		"--endpoint-url", endpoint,
		"s3api", "list-objects-v2",
		"--bucket", bucket,
		"--prefix", strings.Trim(prefix, "/")+"/",
		"--output", "json",
	).Output()
	if err != nil {
		msg := string(listOutput)
		// Output stores the command's stderr on the ExitError when it fails.
		if exitErr, ok := err.(*exec.ExitError); ok {
			msg += "\n" + string(exitErr.Stderr)
		}
		msg = strings.TrimSpace(msg)
		if msg == "" {
			msg = err.Error()
		}
		metrics.Error = msg
		return metrics, nil
	}

	// An empty stdout cannot be decoded as JSON; treat it as an empty listing
	// instead of reporting a decode error.
	if strings.TrimSpace(string(listOutput)) == "" {
		return metrics, nil
	}

	var resp struct {
		Contents []struct {
			Key          string    `json:"Key"`
			Size         int64     `json:"Size"`
			LastModified time.Time `json:"LastModified"`
		} `json:"Contents"`
	}
	if err := json.Unmarshal(listOutput, &resp); err != nil {
		metrics.Error = fmt.Sprintf("invalid usage response: %v", err)
		return metrics, nil
	}

	metrics.ObjectCount = len(resp.Contents)
	for _, obj := range resp.Contents {
		metrics.TotalBytes += obj.Size
	}

	if len(resp.Contents) > 0 {
		// Newest first, so index 0 is the latest backup.
		sort.Slice(resp.Contents, func(i, j int) bool {
			return resp.Contents[i].LastModified.After(resp.Contents[j].LastModified)
		})
		metrics.LatestKey = resp.Contents[0].Key
		metrics.LatestBackup = resp.Contents[0].LastModified.UTC().Format(time.RFC3339)
	}

	return metrics, nil
}
// DetectIssues derives boolean problem flags from a collected Metrics snapshot.
//
// Guest-related flags are evaluated only when guest metrics are present; the
// VM-state flag applies to instances whose status is "active"; the backup flag
// is evaluated only when backup status is present.
func DetectIssues(metrics Metrics) Issues {
	var found Issues

	if guest := metrics.Guest; guest != nil {
		found.GuestDiskUsageHigh = guest.DiskPercent > guestDiskCriticalPercent
		found.GuestMemoryUsageHigh = guest.MemoryPercent > guestMemoryCriticalPercent
		found.GatewayDown = !guest.ServiceActive
		found.HTTPUnhealthy = guest.HTTPStatus != 200
		// A mismatch can only be asserted once a version was actually detected.
		found.VersionMismatch = guest.Version != "" && !guest.VersionConsistent
	}

	found.VMNotRunning = metrics.Instance.Status == "active" && metrics.Host.State != "running"

	if backup := metrics.Backup; backup != nil {
		found.BackupStale = backup.AgeHours > backupCriticalAgeHours
	}

	return found
}
// LoadInstances reads the JSON registry at registryPath and returns one
// Instance per entry of its top-level "instances" array.
//
// The string fields "name", "domain", "host", "user" and "status" are copied
// when present with the right type; an empty or missing "host" leaves
// Instance.Host nil. The raw entry is kept in Instance.Additional. An error is
// returned when the file cannot be read or is not valid JSON.
func LoadInstances(registryPath string) ([]Instance, error) {
	// Read the file directly instead of spawning `cat`; the wrapped error then
	// names the path and the underlying OS failure.
	data, err := os.ReadFile(registryPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read registry: %w", err)
	}

	var registry struct {
		Instances []map[string]any `json:"instances"`
	}
	if err := json.Unmarshal(data, &registry); err != nil {
		return nil, fmt.Errorf("failed to parse registry: %w", err)
	}

	instances := make([]Instance, 0, len(registry.Instances))
	for _, rawInst := range registry.Instances {
		inst := Instance{}
		if name, ok := rawInst["name"].(string); ok {
			inst.Name = name
		}
		if domain, ok := rawInst["domain"].(string); ok {
			inst.Domain = domain
		}
		if host, ok := rawInst["host"].(string); ok && host != "" {
			inst.Host = &host
		}
		if user, ok := rawInst["user"].(string); ok {
			inst.User = user
		}
		if status, ok := rawInst["status"].(string); ok {
			inst.Status = status
		}
		inst.Additional = rawInst
		instances = append(instances, inst)
	}
	return instances, nil
}
// envDefault returns the value of the environment variable key, or def when
// the variable is unset or set to the empty string.
func envDefault(key, def string) string {
	value, found := os.LookupEnv(key)
	if !found || value == "" {
		return def
	}
	return value
}
// virshCmd runs `virsh -c qemu:///system <args...>` and returns its combined
// stdout and stderr as a string, together with the error from running it.
func virshCmd(args ...string) (string, error) {
	argv := make([]string, 0, len(args)+2)
	argv = append(argv, "-c", "qemu:///system")
	argv = append(argv, args...)

	out, err := exec.Command("virsh", argv...).CombinedOutput()
	return string(out), err
}
// sshCmd runs command on user@host through the ssh client and returns the
// combined stdout and stderr as a string, together with the error from
// running it.
//
// The connection is non-interactive (BatchMode) with a 5 second connect
// timeout. Host key checking is disabled and no known_hosts file is used.
func sshCmd(host, user, command string) (string, error) {
	target := user + "@" + host
	argv := []string{
		"-o", "ConnectTimeout=5",
		"-o", "BatchMode=yes",
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "LogLevel=ERROR",
		"-q",
		target,
		command,
	}

	out, err := exec.Command("ssh", argv...).CombinedOutput()
	return string(out), err
}
// parseVirshInfo returns the value of the first "key: value" line in info, or
// "" when no line starts with "key:".
//
// Spaces and tabs after the colon are skipped; the rest of the line is
// returned as-is. The separator is restricted to spaces and tabs so that a key
// with an empty value yields "" instead of the content of the following line.
func parseVirshInfo(info, key string) string {
	re := regexp.MustCompile(fmt.Sprintf(`(?m)^%s:[ \t]*(.*)$`, regexp.QuoteMeta(key)))
	if match := re.FindStringSubmatch(info); len(match) > 1 {
		return match[1]
	}
	return ""
}
// memoryQuantityRe matches a lower-cased quantity such as "4194304 kib",
// "1.5gb" or "2048 b": a decimal number, an optional k/m/g/t prefix, an
// optional "i", and a trailing "b".
var memoryQuantityRe = regexp.MustCompile(`^(\d+(?:\.\d+)?)\s*([kmgt]?)i?b$`)

// parseMemoryKiB converts a memory quantity such as "4194304 KiB" or "4 GiB"
// into KiB, truncating any fractional part.
//
// Matching is case-insensitive. All prefixes are treated as powers of 1024,
// and a quantity with no prefix is interpreted as bytes. An error is returned
// when memStr does not have the expected "<number> <unit>" shape.
func parseMemoryKiB(memStr string) (int64, error) {
	memStr = strings.TrimSpace(strings.ToLower(memStr))
	match := memoryQuantityRe.FindStringSubmatch(memStr)
	if match == nil {
		return 0, fmt.Errorf("invalid memory format: %s", memStr)
	}

	value, err := strconv.ParseFloat(match[1], 64)
	if err != nil {
		return 0, fmt.Errorf("failed to parse memory value: %w", err)
	}

	var kib float64
	switch match[2] {
	case "":
		// Plain bytes: scale down to KiB.
		kib = value / 1024
	case "k":
		kib = value
	case "m":
		kib = value * 1024
	case "g":
		kib = value * 1024 * 1024
	case "t":
		kib = value * 1024 * 1024 * 1024
	}
	return int64(kib), nil
}
// parseCPUTimeNS converts a CPU-time string into nanoseconds.
//
// Two layouts are accepted:
//   - a single token of seconds with an optional "s" suffix, e.g. "4557.0s";
//   - five or more whitespace-separated tokens where tokens 0, 2 and 4 hold
//     hours, minutes and seconds, e.g. "1 h 2 m 3s".
//
// Any other token count, or a non-numeric component, yields an error.
func parseCPUTimeNS(cpuStr string) (int64, error) {
	parts := strings.Fields(cpuStr)

	var hours, minutes float64
	var secondsField string
	switch {
	case len(parts) == 1:
		secondsField = parts[0]
	case len(parts) >= 5:
		// Index 4 is read below, so at least five tokens are required.
		var err error
		if hours, err = strconv.ParseFloat(parts[0], 64); err != nil {
			return 0, fmt.Errorf("invalid CPU time hours: %w", err)
		}
		if minutes, err = strconv.ParseFloat(parts[2], 64); err != nil {
			return 0, fmt.Errorf("invalid CPU time minutes: %w", err)
		}
		secondsField = parts[4]
	default:
		return 0, fmt.Errorf("invalid CPU time format")
	}

	seconds, err := strconv.ParseFloat(strings.TrimSuffix(secondsField, "s"), 64)
	if err != nil {
		return 0, fmt.Errorf("invalid CPU time seconds: %w", err)
	}

	totalSeconds := hours*3600 + minutes*60 + seconds
	return int64(totalSeconds * 1e9), nil
}
// getDiskStats reports the space actually allocated on disk for path, in bytes.
//
// It runs `stat -c "%b %B"`, which prints the number of allocated blocks and
// the size in bytes of each of those blocks, and returns their product as
// actual. virtual is not computed and is always 0. An error is returned when
// stat fails or its output cannot be parsed.
func getDiskStats(path string) (actual, virtual int64, err error) {
	// "--" keeps a path that begins with "-" from being read as an option.
	out, err := exec.Command("stat", "-c", "%b %B", "--", path).Output()
	if err != nil {
		return 0, 0, err
	}

	fields := strings.Fields(string(out))
	if len(fields) < 2 {
		return 0, 0, fmt.Errorf("invalid stat output")
	}
	blockCount, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block count: %w", err)
	}
	blockBytes, err := strconv.ParseInt(fields[1], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block size: %w", err)
	}

	return blockCount * blockBytes, 0, nil
}