feat: complete agent monitoring - hook, UI, and backend filter
- Add event_type and framework filters to events query endpoint
- Add /agents SPA route to web-ui server
- Add Agents nav link and route in frontend
- Add agents page CSS (timeline, VM pills, stats panel)
- Build VM status strip, activity timeline, and real-time stats
- Add agentmon hook for OpenClaw (HOOK.md + handler.ts)
- Add docker-compose, Dockerfile, and supporting infra files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
package main
|
||||
|
||||
import (
	"context"
	"crypto/rand"
	"encoding/json"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"

	"agentmon/internal/monitor/openclaw"
	qnats "agentmon/internal/queue/nats"
)
|
||||
|
||||
// Event is the JSON envelope that emitEvent marshals and publishes for every
// monitoring snapshot. Each section is a free-form map so that callers can
// attach optional keys without changing the type.
type Event struct {
	// Schema identifies the envelope format; serialized under "schema".
	Schema map[string]any `json:"schema"`
	// Event carries the per-event header; serialized under "event".
	Event map[string]any `json:"event"`
	// Payload carries the collected data; serialized under "payload".
	Payload map[string]any `json:"payload"`
}
|
||||
|
||||
func main() {
|
||||
natsURL := envDefault("NATS_URL", "nats://nats:4222")
|
||||
natsTopic := envDefault("NATS_TOPIC", "agentmon.events.v1")
|
||||
registryPath := envDefault("OPENCLAW_REGISTRY", "/home/will/.claude/state/openclaw-instances.json")
|
||||
interval := envDefault("POLL_INTERVAL", "30s")
|
||||
|
||||
pub, err := qnats.NewPublisher(natsURL, natsTopic)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to connect to NATS: %v", err)
|
||||
}
|
||||
defer pub.Close()
|
||||
|
||||
pollDuration, err := time.ParseDuration(interval)
|
||||
if err != nil {
|
||||
log.Fatalf("invalid poll interval: %v", err)
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(pollDuration)
|
||||
defer ticker.Stop()
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
log.Printf("openclaw-monitor started, polling every %s", pollDuration)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
if err := pollInstances(ctx, pub, registryPath); err != nil {
|
||||
log.Printf("poll error: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func pollInstances(ctx context.Context, pub *qnats.Publisher, registryPath string) error {
|
||||
instances, err := openclaw.LoadInstances(registryPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, instance := range instances {
|
||||
metrics := openclaw.Metrics{
|
||||
Instance: instance,
|
||||
Timestamp: time.Now().UTC(),
|
||||
}
|
||||
|
||||
hostMetrics, err := openclaw.CollectHostMetrics(instance.Domain)
|
||||
if err != nil {
|
||||
metrics.Error = err.Error()
|
||||
emitEvent(ctx, pub, instance.Name, metrics)
|
||||
continue
|
||||
}
|
||||
metrics.Host = hostMetrics
|
||||
|
||||
if hostMetrics.State == "running" && instance.Host != nil {
|
||||
guestMetrics, err := openclaw.CollectGuestMetrics(*instance.Host, instance.User)
|
||||
if err != nil {
|
||||
log.Printf("guest collection failed for %s: %v", instance.Name, err)
|
||||
} else {
|
||||
metrics.Guest = guestMetrics
|
||||
}
|
||||
}
|
||||
|
||||
backupStatus, err := openclaw.CollectBackupStatus(instance.Name)
|
||||
if err != nil {
|
||||
log.Printf("backup collection failed for %s: %v", instance.Name, err)
|
||||
} else {
|
||||
metrics.Backup = backupStatus
|
||||
}
|
||||
|
||||
issues := openclaw.DetectIssues(metrics)
|
||||
if anyIssues(issues) {
|
||||
log.Printf("issues detected for %s: %+v", instance.Name, issues)
|
||||
}
|
||||
|
||||
emitEvent(ctx, pub, instance.Name, metrics)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func emitEvent(ctx context.Context, pub *qnats.Publisher, instanceName string, metrics openclaw.Metrics) {
|
||||
event := Event{
|
||||
Schema: map[string]any{
|
||||
"name": "agentmon.openclaw",
|
||||
"version": 1,
|
||||
},
|
||||
Event: map[string]any{
|
||||
"id": generateID(),
|
||||
"type": "openclaw.snapshot",
|
||||
"ts": metrics.Timestamp.UTC().Format(time.RFC3339Nano),
|
||||
},
|
||||
Payload: map[string]any{
|
||||
"instance": metrics.Instance,
|
||||
"host": metrics.Host,
|
||||
},
|
||||
}
|
||||
|
||||
if metrics.Guest != nil {
|
||||
event.Payload["guest"] = metrics.Guest
|
||||
}
|
||||
if metrics.Backup != nil {
|
||||
event.Payload["backup"] = metrics.Backup
|
||||
}
|
||||
if metrics.Error != "" {
|
||||
event.Payload["error"] = metrics.Error
|
||||
}
|
||||
|
||||
issues := openclaw.DetectIssues(metrics)
|
||||
if anyIssues(issues) {
|
||||
event.Payload["issues"] = issues
|
||||
}
|
||||
|
||||
data, err := json.Marshal(event)
|
||||
if err != nil {
|
||||
log.Printf("failed to marshal event for %s: %v", instanceName, err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := pub.Publish(ctx, data); err != nil {
|
||||
log.Printf("failed to publish event for %s: %v", instanceName, err)
|
||||
}
|
||||
}
|
||||
|
||||
func anyIssues(issues openclaw.Issues) bool {
|
||||
return issues.GuestDiskUsageHigh ||
|
||||
issues.GuestMemoryUsageHigh ||
|
||||
issues.HostDiskUsageHigh ||
|
||||
issues.GatewayDown ||
|
||||
issues.HTTPUnhealthy ||
|
||||
issues.VersionMismatch ||
|
||||
issues.VMNotRunning ||
|
||||
issues.BackupStale
|
||||
}
|
||||
|
||||
func generateID() string {
|
||||
return time.Now().Format("20060102150405") + "-" + randomString(8)
|
||||
}
|
||||
|
||||
// randomString returns n characters drawn from [a-z0-9]. It returns "" when n
// is zero or negative.
//
// Bytes come from crypto/rand rather than from the wall clock, so the result
// does not depend on timer resolution and the function never sleeps.
func randomString(n int) string {
	const chars = "abcdefghijklmnopqrstuvwxyz0123456789"

	if n <= 0 {
		return ""
	}

	b := make([]byte, n)
	if _, err := rand.Read(b); err != nil {
		// The system entropy source failing is extremely unlikely. Degrade to
		// a clock-seeded generator rather than failing the caller; the IDs
		// built from this are not security sensitive.
		seed := uint64(time.Now().UnixNano())
		for i := range b {
			seed = seed*6364136223846793005 + 1442695040888963407
			b[i] = byte(seed >> 56)
		}
	}

	// Map each byte onto the alphabet. 256 is not a multiple of 36, so the
	// first four characters are very slightly favoured; acceptable for IDs.
	for i, v := range b {
		b[i] = chars[int(v)%len(chars)]
	}
	return string(b)
}
|
||||
|
||||
// envDefault returns the value of the environment variable key, or def when
// the variable is unset or set to the empty string.
func envDefault(key, def string) string {
	value, present := os.LookupEnv(key)
	if !present || value == "" {
		return def
	}
	return value
}
|
||||
Reference in New Issue
Block a user