package main import ( "context" "encoding/json" "log" "os" "time" "agentmon/internal/monitor/openclaw" qnats "agentmon/internal/queue/nats" ) type Event struct { Schema map[string]any `json:"schema"` Event map[string]any `json:"event"` Payload map[string]any `json:"payload"` } func main() { natsURL := envDefault("NATS_URL", "nats://nats:4222") natsTopic := envDefault("NATS_TOPIC", "agentmon.events.v1") registryPath := envDefault("OPENCLAW_REGISTRY", "/home/will/.claude/state/openclaw-instances.json") interval := envDefault("POLL_INTERVAL", "30s") pub, err := qnats.NewPublisher(natsURL, natsTopic) if err != nil { log.Fatalf("failed to connect to NATS: %v", err) } defer pub.Close() pollDuration, err := time.ParseDuration(interval) if err != nil { log.Fatalf("invalid poll interval: %v", err) } ticker := time.NewTicker(pollDuration) defer ticker.Stop() ctx := context.Background() log.Printf("openclaw-monitor started, polling every %s", pollDuration) for { select { case <-ticker.C: if err := pollInstances(ctx, pub, registryPath); err != nil { log.Printf("poll error: %v", err) } } } } func pollInstances(ctx context.Context, pub *qnats.Publisher, registryPath string) error { instances, err := openclaw.LoadInstances(registryPath) if err != nil { return err } for _, instance := range instances { metrics := openclaw.Metrics{ Instance: instance, Timestamp: time.Now().UTC(), } hostMetrics, err := openclaw.CollectHostMetrics(instance.Domain) if err != nil { metrics.Error = err.Error() emitEvent(ctx, pub, instance.Name, metrics) continue } metrics.Host = hostMetrics if hostMetrics.State == "running" && instance.Host != nil { guestMetrics, err := openclaw.CollectGuestMetrics(*instance.Host, instance.User) if err != nil { log.Printf("guest collection failed for %s: %v", instance.Name, err) } else { metrics.Guest = guestMetrics } } backupStatus, err := openclaw.CollectBackupStatus(instance.Name) if err != nil { log.Printf("backup collection failed for %s: %v", instance.Name, err) } else { metrics.Backup = backupStatus } issues := openclaw.DetectIssues(metrics) if anyIssues(issues) { log.Printf("issues detected for %s: %+v", instance.Name, issues) } emitEvent(ctx, pub, instance.Name, metrics) } return nil } func emitEvent(ctx context.Context, pub *qnats.Publisher, instanceName string, metrics openclaw.Metrics) { event := Event{ Schema: map[string]any{ "name": "agentmon.openclaw", "version": 1, }, Event: map[string]any{ "id": generateID(), "type": "openclaw.snapshot", "ts": metrics.Timestamp.UTC().Format(time.RFC3339Nano), }, Payload: map[string]any{ "instance": metrics.Instance, "host": metrics.Host, }, } if metrics.Guest != nil { event.Payload["guest"] = metrics.Guest } if metrics.Backup != nil { event.Payload["backup"] = metrics.Backup } if metrics.Error != "" { event.Payload["error"] = metrics.Error } issues := openclaw.DetectIssues(metrics) if anyIssues(issues) { event.Payload["issues"] = issues } data, err := json.Marshal(event) if err != nil { log.Printf("failed to marshal event for %s: %v", instanceName, err) return } if err := pub.Publish(ctx, data); err != nil { log.Printf("failed to publish event for %s: %v", instanceName, err) } } func anyIssues(issues openclaw.Issues) bool { return issues.GuestDiskUsageHigh || issues.GuestMemoryUsageHigh || issues.HostDiskUsageHigh || issues.GatewayDown || issues.HTTPUnhealthy || issues.VersionMismatch || issues.VMNotRunning || issues.BackupStale } func generateID() string { return time.Now().Format("20060102150405") + "-" + randomString(8) } func randomString(n int) string { const chars = "abcdefghijklmnopqrstuvwxyz0123456789" b := make([]byte, n) for i := range b { b[i] = chars[time.Now().Nanosecond()%len(chars)] time.Sleep(time.Nanosecond) } return string(b) } func envDefault(key, def string) string { if v := os.Getenv(key); v != "" { return v } return def }