From f8ddea3698935216ec572d55a7201419d5cd15b7 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Wed, 18 Mar 2026 13:41:26 -0700 Subject: [PATCH] feat: add agentmon services section to infrastructure page Label all agentmon docker-compose services with agentmon.monitor=true and agentmon.group=agentmon so the swarm-monitor picks them up. Adds Group field to ServiceSnapshot, probes /healthz for api/web roles, and renders a separate "Agentmon" section below Swarm Services on the Infrastructure page with new api and worker card renderers. Co-Authored-By: Claude Sonnet 4.6 --- cmd/query-api/main.go | 13 ++++++++ cmd/web-ui/static/app.js | 49 ++++++++++++++++++++++++++--- docker-compose.yaml | 35 +++++++++++++++++++++ internal/monitor/swarm/collector.go | 5 +++ internal/monitor/swarm/types.go | 1 + internal/store/postgres/runs.go | 8 +++-- internal/store/postgres/stats.go | 42 +++++++++++++++++++++++++ 7 files changed, 147 insertions(+), 6 deletions(-) diff --git a/cmd/query-api/main.go b/cmd/query-api/main.go index 0b58176..c4a3afd 100644 --- a/cmd/query-api/main.go +++ b/cmd/query-api/main.go @@ -218,6 +218,19 @@ func main() { httpx.WriteJSON(w, http.StatusOK, summary) }) + r.Get("/v1/stats/top-tools", func(w http.ResponseWriter, r *http.Request) { + limit, _ := strconv.Atoi(r.URL.Query().Get("limit")) + tools, err := db.GetTopTools(r.Context(), limit) + if err != nil { + httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"}) + return + } + if tools == nil { + tools = []postgres.TopTool{} + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"tools": tools}) + }) + r.Get("/v1/stats/timeseries", func(w http.ResponseWriter, r *http.Request) { window := r.URL.Query().Get("window") switch window { diff --git a/cmd/web-ui/static/app.js b/cmd/web-ui/static/app.js index c62c918..9d5b6c8 100644 --- a/cmd/web-ui/static/app.js +++ b/cmd/web-ui/static/app.js @@ -751,7 +751,9 @@ function renderInfraGrid() { const vmNames = Object.keys(openclawState.instances).sort(); - const services = Object.values(swarmState.services); + const allServices = Object.values(swarmState.services); + const agentmonServices = allServices.filter(s => s.group === 'agentmon'); + const swarmServices = allServices.filter(s => s.group !== 'agentmon'); app.innerHTML = `
-

Services

- ${services.length === 0 +

Swarm Services

+ ${swarmServices.length === 0 ? '

No swarm service data

' - : `
${services.map(svc => renderServiceCard(svc)).join('')}
` + : `
${swarmServices.map(svc => renderServiceCard(svc)).join('')}
` + } +
+ +
+

Agentmon

+ ${agentmonServices.length === 0 + ? '

No agentmon service data

' + : `
${agentmonServices.map(svc => renderServiceCard(svc)).join('')}
` }
`; @@ -835,6 +845,10 @@ case 'mcp': return renderMCPCard(svc); case 'voice': return renderVoiceCard(svc); case 'automation':return renderAutomationCard(svc); + case 'api': + case 'web': return renderAPICard(svc); + case 'worker': + case 'queue': return renderWorkerCard(svc); default: return renderGenericServiceCard(svc); } } @@ -966,6 +980,33 @@ `; } + function renderAPICard(svc) { + const httpStatus = svc.http_status; + const httpClass = httpStatus === 200 ? 'ok' : httpStatus ? 'bad' : ''; + return ` +
+ ${serviceCardHeader(svc)} +
+ ${serviceStatRow('HTTP', httpStatus ? String(httpStatus) : '-', httpClass)} + ${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')} + ${serviceStatRow('Container', escapeHTML(svc.container_state || '-'), svc.container_state === 'running' ? 'ok' : 'bad')} +
+
+ `; + } + + function renderWorkerCard(svc) { + return ` +
+ ${serviceCardHeader(svc)} +
+ ${serviceStatRow('Container', escapeHTML(svc.container_state || '-'), svc.container_state === 'running' ? 'ok' : 'bad')} + ${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')} +
+
+ `; + } + function renderGenericServiceCard(svc) { return `
diff --git a/docker-compose.yaml b/docker-compose.yaml index 2cde8e8..47217ef 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,6 +15,10 @@ services: interval: 5s timeout: 5s retries: 5 + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: db nats: image: nats:latest @@ -24,6 +28,10 @@ services: command: "--jetstream" volumes: - nats-data:/data + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: queue ingest-gateway: build: . @@ -38,6 +46,11 @@ services: AGENTMON_ADDR: :8080 NATS_URL: nats://nats:4222 NATS_TOPIC: agentmon.events.v1 + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: api + agentmon.port: "8080" query-api: build: . @@ -56,6 +69,11 @@ services: AGENTMON_QUERY_BASE: http://localhost:8081 NATS_URL: nats://nats:4222 NATS_TOPIC: agentmon.events.v1 + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: api + agentmon.port: "8081" web-ui: build: . @@ -69,6 +87,11 @@ services: environment: AGENTMON_UI_ADDR: :8082 AGENTMON_QUERY_BASE: http://query-api:8081 + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: api + agentmon.port: "8082" event-processor: build: . @@ -83,6 +106,10 @@ services: DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable NATS_URL: nats://nats:4222 NATS_TOPIC: agentmon.events.v1 + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: worker swarm-monitor: build: . @@ -100,6 +127,10 @@ services: LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-} volumes: - /var/run/docker.sock:/var/run/docker.sock:ro + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: worker openclaw-monitor: build: . @@ -121,6 +152,10 @@ services: - /home/will/.ssh/id_rsa.pub:/root/.ssh/id_rsa.pub:ro - /home/will/.ssh/authorized_keys:/root/.ssh/authorized_keys:ro - /var/lib/libvirt:/var/lib/libvirt:ro + labels: + agentmon.monitor: "true" + agentmon.group: agentmon + agentmon.role: worker volumes: postgres-data: diff --git a/internal/monitor/swarm/collector.go b/internal/monitor/swarm/collector.go index 41b435d..6425e3c 100644 --- a/internal/monitor/swarm/collector.go +++ b/internal/monitor/swarm/collector.go @@ -87,6 +87,7 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient name := containerName(c) snap := ServiceSnapshot{ Name: name, + Group: c.Labels["agentmon.group"], Role: c.Labels["agentmon.role"], ContainerState: c.State, HealthState: "none", @@ -121,6 +122,10 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/") case "mcp": collectPortProbe(&snap, port) + case "api", "web": + if port != "" { + collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/healthz") + } } snap.Status = deriveStatus(snap) diff --git a/internal/monitor/swarm/types.go b/internal/monitor/swarm/types.go index 1ef8f1f..2af8395 100644 --- a/internal/monitor/swarm/types.go +++ b/internal/monitor/swarm/types.go @@ -5,6 +5,7 @@ import "time" // ServiceSnapshot holds the collected state for one docker-compose service. type ServiceSnapshot struct { Name string `json:"name"` + Group string `json:"group,omitempty"` Role string `json:"role"` ContainerState string `json:"container_state"` // running/stopped/exited/missing HealthState string `json:"health_state"` // healthy/unhealthy/starting/none diff --git a/internal/store/postgres/runs.go b/internal/store/postgres/runs.go index dcffddb..81e8b81 100644 --- a/internal/store/postgres/runs.go +++ b/internal/store/postgres/runs.go @@ -13,6 +13,8 @@ type RunRow struct { EndedAt *time.Time `json:"ended_at,omitempty"` Status string `json:"status"` SpanCount int `json:"span_count"` + ToolCount int `json:"tool_count"` + Model string `json:"model,omitempty"` } type SessionDetail struct { @@ -59,7 +61,9 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error' ELSE 'success' END as status, - COUNT(DISTINCT span_id) as span_count + COUNT(DISTINCT span_id) as span_count, + COUNT(DISTINCT CASE WHEN payload->'attributes'->>'span_kind' = 'tool' THEN span_id END) as tool_count, + COALESCE(MAX(CASE WHEN type = 'run.end' THEN payload->'payload'->>'model' END), '') as model FROM events WHERE session_id = $1 AND run_id IS NOT NULL GROUP BY run_id, session_id @@ -74,7 +78,7 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session var runs []RunRow for rows.Next() { var r RunRow - if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount); err != nil { + if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount, &r.ToolCount, &r.Model); err != nil { return nil, nil, err } runs = append(runs, r) diff --git a/internal/store/postgres/stats.go b/internal/store/postgres/stats.go index 77943f3..375e78c 100644 --- a/internal/store/postgres/stats.go +++ b/internal/store/postgres/stats.go @@ -99,6 +99,48 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) { }, nil } +type TopTool struct { + Name string `json:"name"` + Count int `json:"count"` +} + +func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) { + if limit <= 0 { + limit = 10 + } + now := time.Now() + midnight := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location()) + + q := ` + SELECT + payload->'attributes'->>'name' AS tool_name, + COUNT(*) AS cnt + FROM events + WHERE type = 'span.end' + AND payload->'attributes'->>'span_kind' = 'tool' + AND payload->'attributes'->>'name' IS NOT NULL + AND ts >= $1 + GROUP BY tool_name + ORDER BY cnt DESC + LIMIT $2 + ` + rows, err := d.sql.QueryContext(ctx, q, midnight, limit) + if err != nil { + return nil, err + } + defer rows.Close() + + var out []TopTool + for rows.Next() { + var t TopTool + if err := rows.Scan(&t.Name, &t.Count); err != nil { + return nil, err + } + out = append(out, t) + } + return out, rows.Err() +} + func bucketForWindow(window string) string { switch window { case "1h":