feat: add agentmon services section to infrastructure page
Label all agentmon docker-compose services with agentmon.monitor=true and agentmon.group=agentmon so the swarm-monitor picks them up. Adds Group field to ServiceSnapshot, probes /healthz for api/web roles, and renders a separate "Agentmon" section below Swarm Services on the Infrastructure page with new api and worker card renderers. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -218,6 +218,19 @@ func main() {
|
||||
httpx.WriteJSON(w, http.StatusOK, summary)
|
||||
})
|
||||
|
||||
r.Get("/v1/stats/top-tools", func(w http.ResponseWriter, r *http.Request) {
|
||||
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
|
||||
tools, err := db.GetTopTools(r.Context(), limit)
|
||||
if err != nil {
|
||||
httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"})
|
||||
return
|
||||
}
|
||||
if tools == nil {
|
||||
tools = []postgres.TopTool{}
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"tools": tools})
|
||||
})
|
||||
|
||||
r.Get("/v1/stats/timeseries", func(w http.ResponseWriter, r *http.Request) {
|
||||
window := r.URL.Query().Get("window")
|
||||
switch window {
|
||||
|
||||
@@ -751,7 +751,9 @@
|
||||
|
||||
function renderInfraGrid() {
|
||||
const vmNames = Object.keys(openclawState.instances).sort();
|
||||
const services = Object.values(swarmState.services);
|
||||
const allServices = Object.values(swarmState.services);
|
||||
const agentmonServices = allServices.filter(s => s.group === 'agentmon');
|
||||
const swarmServices = allServices.filter(s => s.group !== 'agentmon');
|
||||
|
||||
app.innerHTML = `
|
||||
<div class="page-header">
|
||||
@@ -767,10 +769,18 @@
|
||||
</div>
|
||||
|
||||
<div class="infra-section">
|
||||
<p class="infra-section-title">Services</p>
|
||||
${services.length === 0
|
||||
<p class="infra-section-title">Swarm Services</p>
|
||||
${swarmServices.length === 0
|
||||
? '<p class="empty-state">No swarm service data</p>'
|
||||
: `<div class="service-grid">${services.map(svc => renderServiceCard(svc)).join('')}</div>`
|
||||
: `<div class="service-grid">${swarmServices.map(svc => renderServiceCard(svc)).join('')}</div>`
|
||||
}
|
||||
</div>
|
||||
|
||||
<div class="infra-section">
|
||||
<p class="infra-section-title">Agentmon</p>
|
||||
${agentmonServices.length === 0
|
||||
? '<p class="empty-state">No agentmon service data</p>'
|
||||
: `<div class="service-grid">${agentmonServices.map(svc => renderServiceCard(svc)).join('')}</div>`
|
||||
}
|
||||
</div>
|
||||
`;
|
||||
@@ -835,6 +845,10 @@
|
||||
case 'mcp': return renderMCPCard(svc);
|
||||
case 'voice': return renderVoiceCard(svc);
|
||||
case 'automation':return renderAutomationCard(svc);
|
||||
case 'api':
|
||||
case 'web': return renderAPICard(svc);
|
||||
case 'worker':
|
||||
case 'queue': return renderWorkerCard(svc);
|
||||
default: return renderGenericServiceCard(svc);
|
||||
}
|
||||
}
|
||||
@@ -966,6 +980,33 @@
|
||||
`;
|
||||
}
|
||||
|
||||
/**
 * Builds the card markup for a service whose role is dispatched to the API
 * renderer. Shows the last HTTP probe status, the uptime and the container
 * state.
 *
 * @param {object} svc - service snapshot; reads `http_status`, `uptime_sec`
 *   and `container_state`.
 * @returns {string} HTML for one service card.
 */
function renderAPICard(svc) {
  const code = svc.http_status;

  // Exactly 200 is highlighted as ok; any other truthy status is bad; a
  // missing/zero status gets no highlight class at all.
  let codeClass = '';
  if (code === 200) {
    codeClass = 'ok';
  } else if (code) {
    codeClass = 'bad';
  }
  const codeText = code ? String(code) : '-';

  const state = svc.container_state;
  const stateClass = state === 'running' ? 'ok' : 'bad';

  return `
    <div class="service-card">
      ${serviceCardHeader(svc)}
      <div class="service-stats">
        ${serviceStatRow('HTTP', codeText, codeClass)}
        ${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')}
        ${serviceStatRow('Container', escapeHTML(state || '-'), stateClass)}
      </div>
    </div>
  `;
}
|
||||
|
||||
/**
 * Builds the card markup for a service whose role is dispatched to the worker
 * renderer. Shows only the container state and the uptime.
 *
 * @param {object} svc - service snapshot; reads `container_state` and
 *   `uptime_sec`.
 * @returns {string} HTML for one service card.
 */
function renderWorkerCard(svc) {
  const state = svc.container_state;
  // Anything other than a running container is flagged as bad.
  const stateClass = state === 'running' ? 'ok' : 'bad';

  return `
    <div class="service-card">
      ${serviceCardHeader(svc)}
      <div class="service-stats">
        ${serviceStatRow('Container', escapeHTML(state || '-'), stateClass)}
        ${serviceStatRow('Uptime', formatUptime(svc.uptime_sec), '')}
      </div>
    </div>
  `;
}
|
||||
|
||||
function renderGenericServiceCard(svc) {
|
||||
return `
|
||||
<div class="service-card">
|
||||
|
||||
@@ -15,6 +15,10 @@ services:
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: db
|
||||
|
||||
nats:
|
||||
image: nats:latest
|
||||
@@ -24,6 +28,10 @@ services:
|
||||
command: "--jetstream"
|
||||
volumes:
|
||||
- nats-data:/data
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: queue
|
||||
|
||||
ingest-gateway:
|
||||
build: .
|
||||
@@ -38,6 +46,11 @@ services:
|
||||
AGENTMON_ADDR: :8080
|
||||
NATS_URL: nats://nats:4222
|
||||
NATS_TOPIC: agentmon.events.v1
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: api
|
||||
agentmon.port: "8080"
|
||||
|
||||
query-api:
|
||||
build: .
|
||||
@@ -56,6 +69,11 @@ services:
|
||||
AGENTMON_QUERY_BASE: http://localhost:8081
|
||||
NATS_URL: nats://nats:4222
|
||||
NATS_TOPIC: agentmon.events.v1
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: api
|
||||
agentmon.port: "8081"
|
||||
|
||||
web-ui:
|
||||
build: .
|
||||
@@ -69,6 +87,11 @@ services:
|
||||
environment:
|
||||
AGENTMON_UI_ADDR: :8082
|
||||
AGENTMON_QUERY_BASE: http://query-api:8081
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: api
|
||||
agentmon.port: "8082"
|
||||
|
||||
event-processor:
|
||||
build: .
|
||||
@@ -83,6 +106,10 @@ services:
|
||||
DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable
|
||||
NATS_URL: nats://nats:4222
|
||||
NATS_TOPIC: agentmon.events.v1
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: worker
|
||||
|
||||
swarm-monitor:
|
||||
build: .
|
||||
@@ -100,6 +127,10 @@ services:
|
||||
LITELLM_MASTER_KEY: ${LITELLM_MASTER_KEY:-}
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: worker
|
||||
|
||||
openclaw-monitor:
|
||||
build: .
|
||||
@@ -121,6 +152,10 @@ services:
|
||||
- /home/will/.ssh/id_rsa.pub:/root/.ssh/id_rsa.pub:ro
|
||||
- /home/will/.ssh/authorized_keys:/root/.ssh/authorized_keys:ro
|
||||
- /var/lib/libvirt:/var/lib/libvirt:ro
|
||||
labels:
|
||||
agentmon.monitor: "true"
|
||||
agentmon.group: agentmon
|
||||
agentmon.role: worker
|
||||
|
||||
volumes:
|
||||
postgres-data:
|
||||
|
||||
@@ -87,6 +87,7 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient
|
||||
name := containerName(c)
|
||||
snap := ServiceSnapshot{
|
||||
Name: name,
|
||||
Group: c.Labels["agentmon.group"],
|
||||
Role: c.Labels["agentmon.role"],
|
||||
ContainerState: c.State,
|
||||
HealthState: "none",
|
||||
@@ -121,6 +122,10 @@ func collectOne(ctx context.Context, c dockerContainer, dockerClient, httpClient
|
||||
collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/")
|
||||
case "mcp":
|
||||
collectPortProbe(&snap, port)
|
||||
case "api", "web":
|
||||
if port != "" {
|
||||
collectHTTPProbe(ctx, &snap, httpClient, "http://localhost:"+port+"/healthz")
|
||||
}
|
||||
}
|
||||
|
||||
snap.Status = deriveStatus(snap)
|
||||
|
||||
@@ -5,6 +5,7 @@ import "time"
|
||||
// ServiceSnapshot holds the collected state for one docker-compose service.
|
||||
type ServiceSnapshot struct {
|
||||
Name string `json:"name"`
|
||||
Group string `json:"group,omitempty"`
|
||||
Role string `json:"role"`
|
||||
ContainerState string `json:"container_state"` // running/stopped/exited/missing
|
||||
HealthState string `json:"health_state"` // healthy/unhealthy/starting/none
|
||||
|
||||
@@ -13,6 +13,8 @@ type RunRow struct {
|
||||
EndedAt *time.Time `json:"ended_at,omitempty"`
|
||||
Status string `json:"status"`
|
||||
SpanCount int `json:"span_count"`
|
||||
ToolCount int `json:"tool_count"`
|
||||
Model string `json:"model,omitempty"`
|
||||
}
|
||||
|
||||
type SessionDetail struct {
|
||||
@@ -59,7 +61,9 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
|
||||
WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error'
|
||||
ELSE 'success'
|
||||
END as status,
|
||||
COUNT(DISTINCT span_id) as span_count
|
||||
COUNT(DISTINCT span_id) as span_count,
|
||||
COUNT(DISTINCT CASE WHEN payload->'attributes'->>'span_kind' = 'tool' THEN span_id END) as tool_count,
|
||||
COALESCE(MAX(CASE WHEN type = 'run.end' THEN payload->'payload'->>'model' END), '') as model
|
||||
FROM events
|
||||
WHERE session_id = $1 AND run_id IS NOT NULL
|
||||
GROUP BY run_id, session_id
|
||||
@@ -74,7 +78,7 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
|
||||
var runs []RunRow
|
||||
for rows.Next() {
|
||||
var r RunRow
|
||||
if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount); err != nil {
|
||||
if err := rows.Scan(&r.RunID, &r.SessionID, &r.StartedAt, &r.EndedAt, &r.Status, &r.SpanCount, &r.ToolCount, &r.Model); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
runs = append(runs, r)
|
||||
|
||||
@@ -99,6 +99,48 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// TopTool pairs a tool name with the number of events counted for it.
type TopTool struct {
	// Name is the tool's name, serialized as "name".
	Name string `json:"name"`
	// Count is the number of events tallied for the tool, serialized as "count".
	Count int `json:"count"`
}
|
||||
|
||||
func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
|
||||
if limit <= 0 {
|
||||
limit = 10
|
||||
}
|
||||
now := time.Now()
|
||||
midnight := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location())
|
||||
|
||||
q := `
|
||||
SELECT
|
||||
payload->'attributes'->>'name' AS tool_name,
|
||||
COUNT(*) AS cnt
|
||||
FROM events
|
||||
WHERE type = 'span.end'
|
||||
AND payload->'attributes'->>'span_kind' = 'tool'
|
||||
AND payload->'attributes'->>'name' IS NOT NULL
|
||||
AND ts >= $1
|
||||
GROUP BY tool_name
|
||||
ORDER BY cnt DESC
|
||||
LIMIT $2
|
||||
`
|
||||
rows, err := d.sql.QueryContext(ctx, q, midnight, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []TopTool
|
||||
for rows.Next() {
|
||||
var t TopTool
|
||||
if err := rows.Scan(&t.Name, &t.Count); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, t)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func bucketForWindow(window string) string {
|
||||
switch window {
|
||||
case "1h":
|
||||
|
||||
Reference in New Issue
Block a user