// Listing metadata: 435 lines, 12 KiB, Go.
package openclaw
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"sort"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Thresholds and sizing values used by the health checks in this package.
const (
	// Guest disk usage thresholds, in percent of the filesystem.
	guestDiskWarningPercent  = 80.0
	guestDiskCriticalPercent = 95.0

	// Guest memory usage thresholds, in percent of total memory.
	guestMemoryWarningPercent  = 80.0
	guestMemoryCriticalPercent = 95.0

	// NOTE(review): presumably the expected on-host disk footprint in GB;
	// nothing in this part of the file reads it — confirm against callers.
	hostDiskActualGB = 32.0

	// Backup age thresholds, in hours since the last backup.
	backupWarningAgeHours  = 25.0
	backupCriticalAgeHours = 48.0
)
|
|
|
|
func CollectHostMetrics(domain string) (HostMetrics, error) {
|
|
metrics := HostMetrics{}
|
|
|
|
state, err := virshCmd("domstate", domain)
|
|
if err != nil {
|
|
return metrics, fmt.Errorf("failed to get VM state: %w", err)
|
|
}
|
|
metrics.State = strings.TrimSpace(state)
|
|
|
|
if metrics.State != "running" {
|
|
metrics.VCPUs = 0
|
|
metrics.MemoryKiB = 0
|
|
metrics.Autostart = false
|
|
metrics.Snapshots = 0
|
|
return metrics, nil
|
|
}
|
|
|
|
info, err := virshCmd("dominfo", domain)
|
|
if err != nil {
|
|
return metrics, fmt.Errorf("failed to get VM info: %w", err)
|
|
}
|
|
|
|
if cpuStr := parseVirshInfo(info, "CPU(s)"); cpuStr != "" {
|
|
if cpu, err := strconv.Atoi(strings.TrimSpace(cpuStr)); err == nil {
|
|
metrics.VCPUs = cpu
|
|
}
|
|
}
|
|
if memStr := parseVirshInfo(info, "Max memory"); memStr != "" {
|
|
if mem, err := parseMemoryKiB(memStr); err == nil {
|
|
metrics.MemoryKiB = mem
|
|
}
|
|
}
|
|
|
|
autostartInfo := parseVirshInfo(info, "Autostart")
|
|
metrics.Autostart = strings.TrimSpace(autostartInfo) == "enable"
|
|
|
|
cpuTime, err := virshCmd("dominfo", domain)
|
|
if err == nil {
|
|
if cpuStr := parseVirshInfo(cpuTime, "CPU time"); cpuStr != "" {
|
|
if cpu, err := parseCPUTimeNS(cpuStr); err == nil {
|
|
metrics.CPUTime = cpu
|
|
}
|
|
}
|
|
}
|
|
|
|
snapshots, err := virshCmd("snapshot-list", domain, "--name")
|
|
if err == nil {
|
|
metrics.Snapshots = len(strings.Fields(strings.TrimSpace(snapshots)))
|
|
}
|
|
|
|
diskPath, err := virshCmd("domblklist", domain)
|
|
if err == nil {
|
|
lines := strings.Split(diskPath, "\n")
|
|
for _, line := range lines {
|
|
fields := strings.Fields(line)
|
|
if len(fields) >= 2 && fields[1] != "" && fields[1] != "-" {
|
|
diskActual, _, err := getDiskStats(fields[1])
|
|
if err == nil {
|
|
metrics.DiskActual = diskActual
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
func CollectGuestMetrics(host, user string) (*GuestMetrics, error) {
|
|
metrics := &GuestMetrics{}
|
|
|
|
serviceStatus, err := sshCmd(host, user, "systemctl --user is-active openclaw-gateway.service")
|
|
if err == nil {
|
|
metrics.ServiceActive = strings.TrimSpace(serviceStatus) == "active"
|
|
}
|
|
|
|
if metrics.ServiceActive {
|
|
uptime, err := sshCmd(host, user, "systemctl --user show openclaw-gateway.service -p ActiveEnterTimestamp --value")
|
|
if err == nil {
|
|
uptimeTS := strings.TrimSpace(uptime)
|
|
if ts, err := time.Parse("Mon 2006-01-02 15:04:05 MST", uptimeTS); err == nil {
|
|
duration := time.Since(ts)
|
|
metrics.ServiceUptime = duration.String()
|
|
}
|
|
}
|
|
}
|
|
|
|
httpCode, err := sshCmd(host, user, "curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:18789/")
|
|
if err == nil {
|
|
if code, err := strconv.Atoi(strings.TrimSpace(httpCode)); err == nil {
|
|
metrics.HTTPStatus = code
|
|
}
|
|
}
|
|
|
|
version, err := sshCmd(host, user, "ls -la ~/.local/share/pnpm/5/node_modules/openclaw | grep -oP 'openclaw@[0-9.]+' | head -1")
|
|
if err == nil {
|
|
metrics.Version = strings.TrimSpace(strings.TrimPrefix(version, "openclaw@"))
|
|
if metrics.Version != "" {
|
|
serviceVersion, err := sshCmd(host, user, "grep OPENCLAW_SERVICE_VERSION ~/.config/systemd/user/openclaw-gateway.service 2>/dev/null | head -1")
|
|
if err == nil && strings.Contains(serviceVersion, metrics.Version) {
|
|
metrics.VersionConsistent = true
|
|
}
|
|
}
|
|
}
|
|
|
|
memInfo, err := sshCmd(host, user, "free -b | grep '^Mem:'")
|
|
if err == nil {
|
|
fields := strings.Fields(memInfo)
|
|
if len(fields) >= 3 {
|
|
if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
|
metrics.MemoryTotal = total
|
|
}
|
|
if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
|
|
metrics.MemoryUsed = used
|
|
}
|
|
if metrics.MemoryTotal > 0 {
|
|
metrics.MemoryPercent = float64(metrics.MemoryUsed) / float64(metrics.MemoryTotal) * 100
|
|
}
|
|
}
|
|
}
|
|
|
|
diskInfo, err := sshCmd(host, user, "df -B1 / | tail -1")
|
|
if err == nil {
|
|
fields := strings.Fields(diskInfo)
|
|
if len(fields) >= 5 {
|
|
if total, err := strconv.ParseInt(fields[1], 10, 64); err == nil {
|
|
metrics.DiskTotal = total
|
|
}
|
|
if used, err := strconv.ParseInt(fields[2], 10, 64); err == nil {
|
|
metrics.DiskUsed = used
|
|
}
|
|
if metrics.DiskTotal > 0 {
|
|
metrics.DiskPercent = float64(metrics.DiskUsed) / float64(metrics.DiskTotal) * 100
|
|
}
|
|
}
|
|
}
|
|
|
|
loadAvg, err := sshCmd(host, user, "awk '{print $1}' /proc/loadavg")
|
|
if err == nil {
|
|
if load, err := strconv.ParseFloat(strings.TrimSpace(loadAvg), 64); err == nil {
|
|
metrics.LoadAverage = load
|
|
}
|
|
}
|
|
|
|
swappiness, err := sshCmd(host, user, "cat /proc/sys/vm/swappiness")
|
|
if err == nil {
|
|
if swap, err := strconv.Atoi(strings.TrimSpace(swappiness)); err == nil {
|
|
metrics.Swappiness = swap
|
|
}
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
func CollectBackupStatus(instanceName string) (*BackupStatus, error) {
|
|
backupPath := "/home/will/lab/swarm/openclaw"
|
|
fileInfo, err := exec.Command("stat", "-c", "%Y", backupPath).Output()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get backup timestamp: %w", err)
|
|
}
|
|
|
|
timestampStr := strings.TrimSpace(string(fileInfo))
|
|
timestamp, err := strconv.ParseInt(timestampStr, 10, 64)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse backup timestamp: %w", err)
|
|
}
|
|
|
|
lastBackup := time.Unix(timestamp, 0)
|
|
age := time.Since(lastBackup)
|
|
ageHours := age.Hours()
|
|
|
|
return &BackupStatus{
|
|
LastBackup: lastBackup.UTC().Format(time.RFC3339),
|
|
AgeHours: ageHours,
|
|
}, nil
|
|
}
|
|
|
|
func CollectMinIOMetrics(instanceName string) (*MinIOMetrics, error) {
|
|
targetInstance := envDefault("OPENCLAW_MINIO_INSTANCE", "zap")
|
|
if instanceName != targetInstance {
|
|
return nil, nil
|
|
}
|
|
|
|
endpoint := envDefault("OPENCLAW_MINIO_ENDPOINT", "http://192.168.153.253:9000")
|
|
bucket := envDefault("OPENCLAW_MINIO_BUCKET", "zap")
|
|
prefix := envDefault("OPENCLAW_MINIO_PREFIX", "backups")
|
|
|
|
metrics := &MinIOMetrics{
|
|
Endpoint: endpoint,
|
|
Bucket: bucket,
|
|
Prefix: prefix,
|
|
}
|
|
|
|
healthURL := strings.TrimRight(endpoint, "/") + "/minio/health/live"
|
|
statusOutput, err := exec.Command("curl", "-s", "-o", "/dev/null", "-w", "%{http_code}", "--connect-timeout", "5", healthURL).CombinedOutput()
|
|
if err != nil {
|
|
metrics.Error = fmt.Sprintf("health check failed: %v", err)
|
|
return metrics, nil
|
|
}
|
|
if code, err := strconv.Atoi(strings.TrimSpace(string(statusOutput))); err == nil {
|
|
metrics.HTTPStatus = code
|
|
metrics.Reachable = code == 200
|
|
}
|
|
|
|
listOutput, err := exec.Command(
|
|
"aws",
|
|
"--endpoint-url", endpoint,
|
|
"s3api", "list-objects-v2",
|
|
"--bucket", bucket,
|
|
"--prefix", strings.Trim(prefix, "/")+"/",
|
|
"--output", "json",
|
|
).CombinedOutput()
|
|
if err != nil {
|
|
msg := strings.TrimSpace(string(listOutput))
|
|
if msg == "" {
|
|
msg = err.Error()
|
|
}
|
|
metrics.Error = msg
|
|
return metrics, nil
|
|
}
|
|
|
|
var resp struct {
|
|
Contents []struct {
|
|
Key string `json:"Key"`
|
|
Size int64 `json:"Size"`
|
|
LastModified time.Time `json:"LastModified"`
|
|
} `json:"Contents"`
|
|
}
|
|
if err := json.Unmarshal(listOutput, &resp); err != nil {
|
|
metrics.Error = fmt.Sprintf("invalid usage response: %v", err)
|
|
return metrics, nil
|
|
}
|
|
|
|
metrics.ObjectCount = len(resp.Contents)
|
|
for _, obj := range resp.Contents {
|
|
metrics.TotalBytes += obj.Size
|
|
}
|
|
|
|
if len(resp.Contents) > 0 {
|
|
sort.Slice(resp.Contents, func(i, j int) bool {
|
|
return resp.Contents[i].LastModified.After(resp.Contents[j].LastModified)
|
|
})
|
|
metrics.LatestKey = resp.Contents[0].Key
|
|
metrics.LatestBackup = resp.Contents[0].LastModified.UTC().Format(time.RFC3339)
|
|
}
|
|
|
|
return metrics, nil
|
|
}
|
|
|
|
func DetectIssues(metrics Metrics) Issues {
|
|
issues := Issues{}
|
|
|
|
if metrics.Guest != nil {
|
|
if metrics.Guest.DiskPercent > guestDiskCriticalPercent {
|
|
issues.GuestDiskUsageHigh = true
|
|
}
|
|
if metrics.Guest.MemoryPercent > guestMemoryCriticalPercent {
|
|
issues.GuestMemoryUsageHigh = true
|
|
}
|
|
if !metrics.Guest.ServiceActive {
|
|
issues.GatewayDown = true
|
|
}
|
|
if metrics.Guest.HTTPStatus != 200 {
|
|
issues.HTTPUnhealthy = true
|
|
}
|
|
if metrics.Guest.Version != "" && !metrics.Guest.VersionConsistent {
|
|
issues.VersionMismatch = true
|
|
}
|
|
}
|
|
|
|
if metrics.Instance.Status == "active" && metrics.Host.State != "running" {
|
|
issues.VMNotRunning = true
|
|
}
|
|
|
|
if metrics.Backup != nil && metrics.Backup.AgeHours > backupCriticalAgeHours {
|
|
issues.BackupStale = true
|
|
}
|
|
|
|
return issues
|
|
}
|
|
|
|
func LoadInstances(registryPath string) ([]Instance, error) {
|
|
data, err := exec.Command("cat", registryPath).Output()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read registry: %w", err)
|
|
}
|
|
|
|
var registry struct {
|
|
Instances []map[string]any `json:"instances"`
|
|
}
|
|
|
|
if err := json.Unmarshal(data, ®istry); err != nil {
|
|
return nil, fmt.Errorf("failed to parse registry: %w", err)
|
|
}
|
|
|
|
instances := make([]Instance, 0, len(registry.Instances))
|
|
for _, rawInst := range registry.Instances {
|
|
inst := Instance{}
|
|
|
|
if name, ok := rawInst["name"].(string); ok {
|
|
inst.Name = name
|
|
}
|
|
if domain, ok := rawInst["domain"].(string); ok {
|
|
inst.Domain = domain
|
|
}
|
|
if host, ok := rawInst["host"].(string); ok && host != "" {
|
|
inst.Host = &host
|
|
}
|
|
if user, ok := rawInst["user"].(string); ok {
|
|
inst.User = user
|
|
}
|
|
if status, ok := rawInst["status"].(string); ok {
|
|
inst.Status = status
|
|
}
|
|
|
|
inst.Additional = rawInst
|
|
|
|
instances = append(instances, inst)
|
|
}
|
|
|
|
return instances, nil
|
|
}
|
|
|
|
// envDefault returns the value of the environment variable key, falling back
// to def when the variable is unset or empty.
func envDefault(key, def string) string {
	value := os.Getenv(key)
	if value == "" {
		return def
	}
	return value
}
|
|
|
|
// virshCmd runs virsh against the qemu:///system connection with the given
// arguments and returns its combined stdout and stderr as a string, together
// with any execution error.
func virshCmd(args ...string) (string, error) {
	argv := make([]string, 0, len(args)+2)
	argv = append(argv, "-c", "qemu:///system")
	argv = append(argv, args...)

	out, err := exec.Command("virsh", argv...).CombinedOutput()
	return string(out), err
}
|
|
|
|
// sshCmd runs command on host as user via the ssh client and returns the
// combined stdout and stderr as a string, together with any execution error.
//
// The connection is non-interactive (BatchMode) with a 5 second connect
// timeout. Host key checking is switched off and no known-hosts file is
// kept, so the remote host's identity is not verified.
func sshCmd(host, user, command string) (string, error) {
	argv := []string{
		"-o", "ConnectTimeout=5",
		"-o", "BatchMode=yes",
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-o", "LogLevel=ERROR",
		"-q",
		fmt.Sprintf("%s@%s", user, host),
		command,
	}

	out, err := exec.Command("ssh", argv...).CombinedOutput()
	return string(out), err
}
|
|
|
|
// parseVirshInfo extracts the value of a "key: value" line from virsh
// output. It returns the text after the colon and any following whitespace
// for the first line that starts with key, or "" when no such line exists.
func parseVirshInfo(info, key string) string {
	// The key is quoted so characters such as "(" in "CPU(s)" match literally.
	pattern := fmt.Sprintf(`(?m)^%s:\s*(.*)$`, regexp.QuoteMeta(key))

	groups := regexp.MustCompile(pattern).FindStringSubmatch(info)
	if len(groups) < 2 {
		return ""
	}
	return groups[1]
}
|
|
|
|
// memoryKiBPattern matches a decimal number followed by an optional binary
// unit prefix (k, m, g, t), an optional "i" and a final "b", e.g.
// "4194304 kib", "2mb" or "512 b". It is compiled once at package scope.
var memoryKiBPattern = regexp.MustCompile(`^(\d+(?:\.\d+)?)\s*([kmgt]?)i?b$`)

// parseMemoryKiB converts a human-readable memory size such as
// "4194304 KiB" or "1.5 GiB" into KiB.
//
// Matching is case-insensitive and surrounding whitespace is ignored. Each
// unit prefix is a power of 1024; a size with no prefix is taken as bytes
// and divided by 1024. Fractional results are truncated toward zero. An
// error is returned when the input does not have the expected shape.
func parseMemoryKiB(memStr string) (int64, error) {
	memStr = strings.TrimSpace(strings.ToLower(memStr))

	match := memoryKiBPattern.FindStringSubmatch(memStr)
	if match == nil {
		return 0, fmt.Errorf("invalid memory format: %s", memStr)
	}

	value, err := strconv.ParseFloat(match[1], 64)
	if err != nil {
		return 0, fmt.Errorf("failed to parse memory value: %w", err)
	}

	var kib float64
	switch match[2] {
	case "":
		// Plain bytes: scale down instead of reporting them as KiB.
		kib = value / 1024
	case "k":
		kib = value
	case "m":
		kib = value * 1024
	case "g":
		kib = value * 1024 * 1024
	case "t":
		kib = value * 1024 * 1024 * 1024
	}

	return int64(kib), nil
}
|
|
|
|
// parseCPUTimeNS converts a CPU time string into nanoseconds.
//
// Two layouts are accepted:
//   - a single token of seconds with an optional "s" suffix, e.g. "1234.5s";
//   - at least five whitespace-separated tokens where tokens 0, 2 and 4 are
//     hours, minutes and seconds, e.g. "1 h 2 m 3.4s". Tokens 1 and 3 are
//     not inspected.
//
// Any other token count, or a component that is not a number, yields an
// error.
func parseCPUTimeNS(cpuStr string) (int64, error) {
	parts := strings.Fields(cpuStr)

	// number parses one component and names it in the error on failure.
	number := func(label, text string) (float64, error) {
		v, err := strconv.ParseFloat(text, 64)
		if err != nil {
			return 0, fmt.Errorf("invalid CPU time %s %q: %w", label, text, err)
		}
		return v, nil
	}

	var hours, minutes float64
	var secondsText string

	switch {
	case len(parts) == 1:
		secondsText = parts[0]
	case len(parts) >= 5:
		// Index 4 is read below, so five tokens are the real minimum.
		var err error
		if hours, err = number("hours", parts[0]); err != nil {
			return 0, err
		}
		if minutes, err = number("minutes", parts[2]); err != nil {
			return 0, err
		}
		secondsText = parts[4]
	default:
		return 0, fmt.Errorf("invalid CPU time format: %q", cpuStr)
	}

	seconds, err := number("seconds", strings.TrimSuffix(secondsText, "s"))
	if err != nil {
		return 0, err
	}

	totalSeconds := hours*3600 + minutes*60 + seconds
	return int64(totalSeconds * 1e9), nil
}
|
|
|
|
// getDiskStats reports how many bytes the file at path occupies on disk.
//
// It runs `stat -c "%b %B"`, which prints the number of allocated blocks and
// the size in bytes of each such block, and returns their product as actual.
// virtual is not computed and is always 0. An error is returned when stat
// fails or its output cannot be parsed.
func getDiskStats(path string) (actual, virtual int64, err error) {
	out, err := exec.Command("stat", "-c", "%b %B", path).Output()
	if err != nil {
		return 0, 0, err
	}

	fields := strings.Fields(string(out))
	if len(fields) < 2 {
		return 0, 0, fmt.Errorf("invalid stat output")
	}

	blockCount, err := strconv.ParseInt(fields[0], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block count %q: %w", fields[0], err)
	}
	blockUnit, err := strconv.ParseInt(fields[1], 10, 64)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid stat block size %q: %w", fields[1], err)
	}

	// Allocated bytes = allocated blocks x bytes per block. The file's
	// apparent size (%s) is deliberately not part of this product.
	return blockCount * blockUnit, 0, nil
}
|