feat: add agentmon services section to infrastructure page

Label all agentmon docker-compose services with agentmon.monitor=true
and agentmon.group=agentmon so the swarm-monitor picks them up.
Add a Group field to ServiceSnapshot, probe /healthz for api/web roles,
and render a separate "Agentmon" section below Swarm Services on the
Infrastructure page with new api and worker card renderers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
William Valentin
2026-03-18 13:41:26 -07:00
parent d2d044a3d8
commit f8ddea3698
7 changed files with 147 additions and 6 deletions
+13
View File
@@ -218,6 +218,19 @@ func main() {
httpx.WriteJSON(w, http.StatusOK, summary) httpx.WriteJSON(w, http.StatusOK, summary)
}) })
// GET /v1/stats/top-tools?limit=N responds with {"tools": [...]} built from
// db.GetTopTools, or a 500 with {"error": "db_error"} when the query fails.
r.Get("/v1/stats/top-tools", func(w http.ResponseWriter, r *http.Request) {
// The Atoi error is ignored: a missing or malformed limit leaves limit at 0,
// and GetTopTools substitutes its default for any limit <= 0.
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
tools, err := db.GetTopTools(r.Context(), limit)
if err != nil {
httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"})
return
}
// Replace a nil result with an empty slice; presumably so the response
// carries [] rather than null (confirm against httpx.WriteJSON's encoder).
if tools == nil {
tools = []postgres.TopTool{}
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"tools": tools})
})
r.Get("/v1/stats/timeseries", func(w http.ResponseWriter, r *http.Request) { r.Get("/v1/stats/timeseries", func(w http.ResponseWriter, r *http.Request) {
window := r.URL.Query().Get("window") window := r.URL.Query().Get("window")
switch window { switch window {
+45 -4
View File
@@ -751,7 +751,9 @@
function renderInfraGrid() { function renderInfraGrid() {
const vmNames = Object.keys(openclawState.instances).sort(); const vmNames = Object.keys(openclawState.instances).sort();
const services = Object.values(swarmState.services); const allServices = Object.values(swarmState.services);
const agentmonServices = allServices.filter(s => s.group === 'agentmon');
const swarmServices = allServices.filter(s => s.group !== 'agentmon');
app.innerHTML = ` app.innerHTML = `
<div class="page-header"> <div class="page-header">
@@ -767,10 +769,18 @@
</div> </div>
<div class="infra-section"> <div class="infra-section">
<p class="infra-section-title">Services</p> <p class="infra-section-title">Swarm Services</p>
${services.length === 0 ${swarmServices.length === 0
? '<p class="empty-state">No swarm service data</p>' ? '<p class="empty-state">No swarm service data</p>'
: `<div class="service-grid">${services.map(svc => renderServiceCard(svc)).join('')}</div>` : `<div class="service-grid">${swarmServices.map(svc => renderServiceCard(svc)).join('')}</div>`
}
</div>
<div class="infra-section">
<p class="infra-section-title">Agentmon</p>
${agentmonServices.length === 0
? '<p class="empty-state">No agentmon service data</p>'
: `<div class="service-grid">${agentmonServices.map(svc => renderServiceCard(svc)).join('')}</div>`
} }
</div> </div>
`; `;
@@ -835,6 +845,10 @@
case 'mcp': return renderMCPCard(svc); case 'mcp': return renderMCPCard(svc);
case 'voice': return renderVoiceCard(svc); case 'voice': return renderVoiceCard(svc);
case 'automation':return renderAutomationCard(svc); case 'automation':return renderAutomationCard(svc);
case 'api':
case 'web': return renderAPICard(svc);
case 'worker':
case 'queue': return renderWorkerCard(svc);
default: return renderGenericServiceCard(svc); default: return renderGenericServiceCard(svc);
} }
} }
@@ -966,6 +980,33 @@
`; `;
} }
// Builds the card markup for a service rendered through the 'api'/'web' branch
// of the role switch: HTTP probe status, uptime, and container state.
// The HTTP row is 'ok' only for status 200, 'bad' for any other truthy status,
// and unstyled with a '-' placeholder when no status is present.
function renderAPICard(svc) {
  const code = svc.http_status;

  let codeClass = '';
  if (code === 200) {
    codeClass = 'ok';
  } else if (code) {
    codeClass = 'bad';
  }

  const codeText = code ? String(code) : '-';
  const containerClass = svc.container_state === 'running' ? 'ok' : 'bad';

  return `
<div class="service-card">
${serviceCardHeader(svc)}
<div class="service-stats">
${serviceStatRow('HTTP', codeText, codeClass)}
${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')}
${serviceStatRow('Container', escapeHTML(svc.container_state || '-'), containerClass)}
</div>
</div>
`;
}
// Builds the card markup for a service rendered through the 'worker'/'queue'
// branch of the role switch: container state first, then uptime.
// The container row is 'ok' only when the state is exactly 'running'.
function renderWorkerCard(svc) {
  const state = svc.container_state;
  const stateClass = state === 'running' ? 'ok' : 'bad';

  return `
<div class="service-card">
${serviceCardHeader(svc)}
<div class="service-stats">
${serviceStatRow('Container', escapeHTML(state || '-'), stateClass)}
${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')}
</div>
</div>
`;
}
function renderGenericServiceCard(svc) { function renderGenericServiceCard(svc) {
return ` return `
<div class="service-card"> <div class="service-card">
+35
View File
@@ -15,6 +15,10 @@ services:
interval: 5s interval: 5s
timeout: 5s timeout: 5s
retries: 5 retries: 5
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: db
nats: nats:
image: nats:latest image: nats:latest
@@ -24,6 +28,10 @@ services:
command: "--jetstream" command: "--jetstream"
volumes: volumes:
- nats-data:/data - nats-data:/data
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: queue
ingest-gateway: ingest-gateway:
build: . build: .
@@ -38,6 +46,11 @@ services:
AGENTMON_ADDR: :8080 AGENTMON_ADDR: :8080
NATS_URL: nats://nats:4222 NATS_URL: nats://nats:4222
NATS_TOPIC: agentmon.events.v1 NATS_TOPIC: agentmon.events.v1
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: api
agentmon.port: "8080"
query-api: query-api:
build: . build: .
@@ -56,6 +69,11 @@ services:
AGENTMON_QUERY_BASE: http://localhost:8081 AGENTMON_QUERY_BASE: http://localhost:8081
NATS_URL: nats://nats:4222 NATS_URL: nats://nats:4222
NATS_TOPIC: agentmon.events.v1 NATS_TOPIC: agentmon.events.v1
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: api
agentmon.port: "8081"
web-ui: web-ui:
build: . build: .
@@ -69,6 +87,11 @@ services:
environment: environment:
AGENTMON_UI_ADDR: :8082 AGENTMON_UI_ADDR: :8082
AGENTMON_QUERY_BASE: http://query-api:8081 AGENTMON_QUERY_BASE: http://query-api:8081
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: api
agentmon.port: "8082"
event-processor: event-processor:
build: . build: .
@@ -83,6 +106,10 @@ services:
DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable
NATS_URL: nats://nats:4222 NATS_URL: nats://nats:4222
NATS_TOPIC: agentmon.events.v1 NATS_TOPIC: agentmon.events.v1
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: worker
swarm-monitor: swarm-monitor:
build: . build: .
@@ -100,6 +127,10 @@ services:
LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-} LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-}
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro - /var/run/docker.sock:/var/run/docker.sock:ro
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: worker
openclaw-monitor: openclaw-monitor:
build: . build: .
@@ -121,6 +152,10 @@ services:
- /home/will/.ssh/id_rsa.pub:/root/.ssh/id_rsa.pub:ro - /home/will/.ssh/id_rsa.pub:/root/.ssh/id_rsa.pub:ro
- /home/will/.ssh/authorized_keys:/root/.ssh/authorized_keys:ro - /home/will/.ssh/authorized_keys:/root/.ssh/authorized_keys:ro
- /var/lib/libvirt:/var/lib/libvirt:ro - /var/lib/libvirt:/var/lib/libvirt:ro
labels:
agentmon.monitor: "true"
agentmon.group: agentmon
agentmon.role: worker
volumes: volumes:
postgres-data: postgres-data:
+5
View File
@@ -87,6 +87,7 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient
name := containerName(c) name := containerName(c)
snap := ServiceSnapshot{ snap := ServiceSnapshot{
Name: name, Name: name,
Group: c.Labels["agentmon.group"],
Role: c.Labels["agentmon.role"], Role: c.Labels["agentmon.role"],
ContainerState: c.State, ContainerState: c.State,
HealthState: "none", HealthState: "none",
@@ -121,6 +122,10 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient
collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/") collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/")
case "mcp": case "mcp":
collectPortProbe(&snap, port) collectPortProbe(&snap, port)
case "api", "web":
if port != "" {
collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/healthz")
}
} }
snap.Status = deriveStatus(snap) snap.Status = deriveStatus(snap)
+1
View File
@@ -5,6 +5,7 @@ import "time"
// ServiceSnapshot holds the collected state for one docker-compose service. // ServiceSnapshot holds the collected state for one docker-compose service.
type ServiceSnapshot struct { type ServiceSnapshot struct {
Name string `json:"name"` Name string `json:"name"`
Group string `json:"group,omitempty"`
Role string `json:"role"` Role string `json:"role"`
ContainerState string `json:"container_state"` // running/stopped/exited/missing ContainerState string `json:"container_state"` // running/stopped/exited/missing
HealthState string `json:"health_state"` // healthy/unhealthy/starting/none HealthState string `json:"health_state"` // healthy/unhealthy/starting/none
+6 -2
View File
@@ -13,6 +13,8 @@ type RunRow struct {
EndedAt *time.Time `json:"ended_at,omitempty"` EndedAt *time.Time `json:"ended_at,omitempty"`
Status string `json:"status"` Status string `json:"status"`
SpanCount int `json:"span_count"` SpanCount int `json:"span_count"`
ToolCount int `json:"tool_count"`
Model string `json:"model,omitempty"`
} }
type SessionDetail struct { type SessionDetail struct {
@@ -59,7 +61,9 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error' WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error'
ELSE 'success' ELSE 'success'
END as status, END as status,
COUNT(DISTINCT span_id) as span_count COUNT(DISTINCT span_id) as span_count,
COUNT(DISTINCT CASE WHEN payload->'attributes'->>'span_kind' = 'tool' THEN span_id END) as tool_count,
COALESCE(MAX(CASE WHEN type = 'run.end' THEN payload->'payload'->>'model' END), '') as model
FROM events FROM events
WHERE session_id = $1 AND run_id IS NOT NULL WHERE session_id = $1 AND run_id IS NOT NULL
GROUP BY run_id, session_id GROUP BY run_id, session_id
@@ -74,7 +78,7 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
var runs []RunRow var runs []RunRow
for rows.Next() { for rows.Next() {
var r RunRow var r RunRow
if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount); err != nil { if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount, &r.ToolCount, &r.Model); err != nil {
return nil, nil, err return nil, nil, err
} }
runs = append(runs, r) runs = append(runs, r)
+42
View File
@@ -99,6 +99,48 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
}, nil }, nil
} }
// TopTool is one entry in the top-tools ranking returned by GetTopTools.
type TopTool struct {
// Name is the tool name, read from the event payload's attributes.name.
Name string `json:"name"`
// Count is the number of matching tool span.end events for that name.
Count int `json:"count"`
}
// GetTopTools returns the most frequently used tools since the start of the
// current local day, ordered by descending count.
//
// A tool use is a 'span.end' event whose attributes carry span_kind 'tool' and
// a non-null name. At most limit rows are returned; a limit <= 0 falls back to
// 10. The returned slice is nil when no rows match.
func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
	if limit < 1 {
		limit = 10
	}

	// Truncate "now" to 00:00:00 in the same location to get today's lower bound.
	current := time.Now()
	dayStart := time.Date(current.Year(), current.Month(), current.Day(), 0, 0, 0, 0, current.Location())

	const query = `
SELECT
payload->'attributes'->>'name' AS tool_name,
COUNT(*) AS cnt
FROM events
WHERE type = 'span.end'
AND payload->'attributes'->>'span_kind' = 'tool'
AND payload->'attributes'->>'name' IS NOT NULL
AND ts >= $1
GROUP BY tool_name
ORDER BY cnt DESC
LIMIT $2
`

	rows, err := d.sql.QueryContext(ctx, query, dayStart, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var tools []TopTool
	for rows.Next() {
		var tool TopTool
		if scanErr := rows.Scan(&tool.Name, &tool.Count); scanErr != nil {
			return nil, scanErr
		}
		tools = append(tools, tool)
	}
	// rows.Err reports any error that ended the iteration early.
	return tools, rows.Err()
}
func bucketForWindow(window string) string { func bucketForWindow(window string) string {
switch window { switch window {
case "1h": case "1h":