diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f379ee8 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# Local development configuration +# Copy this file to .env and adjust as needed + +# Postgres database (running via docker-compose) +DATABASE_URL=postgres://postgres:pass@localhost:5432/agentmon?sslmode=disable + +# NATS message queue (running via docker-compose) +NATS_URL=nats://localhost:4222 + +# NATS topic for events +NATS_TOPIC=agentmon.events.v1 + +# Query API base URL (for web-ui proxy) +AGENTMON_QUERY_BASE=http://localhost:8081 diff --git a/.gitignore b/.gitignore index e458ed5..48d471b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ .worktrees/ +.env +/ingest-gateway +/query-api +/web-ui diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8a13d52 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,166 @@ +# AGENTS.md + +This file provides guidelines for agentic coding agents working on the agentmon repository. + +## Build/Lint/Test Commands + +```bash +# Run all tests +go test ./... + +# Run tests for a specific package +go test ./internal/event + +# Run a single test +go test ./internal/event -run TestValidate_ValidEvent + +# Run tests with verbose output +go test -v ./... 
+ +# Tidy dependencies +go mod tidy + +# Run services via Makefile +make tidy +make test +make run-ingest # Ingest gateway (requires NATS_URL, NATS_TOPIC) +make run-query # Query API (requires DATABASE_URL) +make run-ui # Web UI +make run-processor # Event processor (requires DATABASE_URL, NATS_URL, NATS_TOPIC) + +# Build executables +go build -o ingest-gateway ./cmd/ingest-gateway +go build -o query-api ./cmd/query-api +go build -o web-ui ./cmd/web-ui +``` + +## Code Style Guidelines + +### Imports +- Order: stdlib, internal packages, external packages +- Separate each group with a blank line +- No unused imports + +Example: +```go +import ( + "context" + "database/sql" + + "agentmon/internal/event" + "agentmon/internal/httpx" + + "github.com/go-chi/chi/v5" + "github.com/jackc/pgx/v5" +) +``` + +### Formatting +- Use `go fmt` or enable auto-formatting +- Standard Go formatting rules apply +- No inline comments unless necessary + +### Types +- Use `any` for generic types (not `interface{}`) +- Use pointer types (`*int64`) for optional JSON fields +- Struct tags for JSON serialization: `json:"field_name,omitempty"` +- Use `sql.ErrNoRows` for "not found" database errors + +### Naming Conventions +- Exported: CamelCase (e.g., `ValidationError`, `Publish`) +- Unexported: camelCase (e.g., `db`, `validate`) +- Acronyms: keep uppercase (e.g., `DB`, `NATS`, `URL`) +- Constants: CamelCase (e.g., `validTypes`) +- Test functions: `TestFunction_Scenario` (e.g., `TestValidate_ValidEvent`) + +### Error Handling +- Always check errors, don't ignore +- Use `log.Fatalf` for startup errors (main package only) +- Use `errors.As()` for type assertions: `errors.As(err, &ve)` +- Custom error types must implement `Error() string` method +- Return errors from functions, handle at call site +- HTTP errors: return JSON with error field, appropriate status code + +Example: +```go +if err != nil { + return nil, fmt.Errorf("operation failed: %w", err) +} + +var ve ValidationError +if errors.As(err, &ve) { + return ValidationError{Field: "field", 
Message: "message"} +} +``` + +### Database +- Use `context.Context` for all DB operations +- Use pgx/v5 via stdlib interface: `sql.Open("pgx", url)` +- Check `sql.ErrNoRows` explicitly for not-found cases +- Always defer `db.Close()` in main functions + +### HTTP +- Use chi router with middleware +- Standard middleware chain: `RequestID`, `RealIP`, `Logger`, `Recoverer` +- Health check endpoint: `GET /healthz` returns 200 with "ok" +- JSON responses: use `httpx.WriteJSON(w, status, data)` +- Get path params: `chi.URLParam(r, "paramName")` +- Get query params: `r.URL.Query().Get("key")` + +### Configuration +- Use environment variables for configuration +- Helper pattern: `envDefault(key, defaultValue)` function +- Required env vars: log.Fatal if missing +- Optional env vars: provide sensible defaults + +### Validation +- Validate all input (HTTP, events) +- Return structured errors with field path and message +- Type assertion with comma-ok for error type checking +- Valid event types: `session.start`, `session.end`, `run.start`, `run.end`, `span.start`, `span.end`, `error`, `metric.snapshot` + +### Testing +- Use standard `testing` package +- Test file naming: `*_test.go` +- Test function naming: `Test_` +- Use `t.Fatalf` for setup failures +- Use `t.Fatal` for assertion failures (not `t.Error`) +- Minimal test structure: setup, act, assert + +Example: +```go +func TestValidate_ValidEvent(t *testing.T) { + raw := `{"schema": {"name": "agentmon.event", "version": 1}}` + var m map[string]any + _ = json.Unmarshal([]byte(raw), &m) + + err := Validate(m) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } +} +``` + +### Context Usage +- Pass `context.Context` to functions that do I/O or external calls +- Use `context.WithTimeout` for operations with deadlines +- Defer cancel functions +- Example: `ctx, cancel := context.WithTimeout(ctx, 5*time.Second)` + +### Package Structure +- `cmd/`: Executable entry points (main packages) +- `internal/`: Internal 
packages not imported by external code +- `internal/event/`: Event schema and validation +- `internal/httpx/`: HTTP utilities +- `internal/store/postgres/`: Database operations +- `internal/queue/nats/`: NATS publishing + +### JSON Handling +- Decode to `map[string]any` for flexible event processing +- Type assertions: `if v, ok := m["key"].(string); ok {}` +- Use `json.RawMessage` for buffering JSON data +- JSON encoder/decoder for I/O + +### Logging +- Use `log.Printf` for general logging +- Use `log.Fatalf` for unrecoverable errors in main +- Minimal logging in packages (prefer returning errors) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..4cb745c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,37 @@ +FROM golang:1.25 AS builder + +WORKDIR /app + +COPY go.mod go.sum ./ +RUN go mod download + +COPY . . + +RUN CGO_ENABLED=0 GOOS=linux go build -o /usr/local/bin/ingest-gateway ./cmd/ingest-gateway +RUN CGO_ENABLED=0 GOOS=linux go build -o /usr/local/bin/query-api ./cmd/query-api +RUN CGO_ENABLED=0 GOOS=linux go build -o /usr/local/bin/web-ui ./cmd/web-ui +RUN CGO_ENABLED=0 GOOS=linux go build -o /usr/local/bin/event-processor ./cmd/event-processor +RUN CGO_ENABLED=0 GOOS=linux go build -o /usr/local/bin/openclaw-monitor ./cmd/openclaw-monitor + +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y \ + ca-certificates \ + libvirt-clients \ + openssh-client \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY --from=builder /usr/local/bin/* /usr/local/bin/ + +ENV AGENTMON_ADDR=:8080 +ENV AGENTMON_QUERY_ADDR=:8081 +ENV AGENTMON_UI_ADDR=:8082 +ENV DATABASE_URL=postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable +ENV NATS_URL=nats://nats:4222 +ENV NATS_TOPIC=agentmon.events.v1 +ENV OPENCLAW_REGISTRY=/openclaw-registry/openclaw-instances.json +ENV POLL_INTERVAL=30s + +CMD ["ingest-gateway"] diff --git a/Makefile b/Makefile index c6537b8..2176974 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,10 @@ -.PHONY: tidy 
test run-ingest run-query run-ui run-processor +.PHONY: tidy test run-ingest run-query run-ui run-processor run-openclaw-monitor up down logs + +# Load .env file +ifneq (,$(wildcard ./.env)) + include .env + export +endif tidy: go mod tidy @@ -18,29 +24,14 @@ run-ui: run-processor: DATABASE_URL=$${DATABASE_URL:?set DATABASE_URL} NATS_URL=$${NATS_URL:-nats://nats:4222} NATS_TOPIC=$${NATS_TOPIC:-agentmon.events.v1} go run ./cmd/event-processor -tidy: - go mod tidy +run-openclaw-monitor: + NATS_URL=$${NATS_URL:-nats://nats:4222} NATS_TOPIC=$${NATS_TOPIC:-agentmon.events.v1} OPENCLAW_REGISTRY=$${OPENCLAW_REGISTRY:-/home/will/.claude/state/openclaw-instances.json} POLL_INTERVAL=$${POLL_INTERVAL:-30s} go run ./cmd/openclaw-monitor -test: - go test ./... +up: + docker-compose up -d -run-ingest: - AGENTMON_ADDR=:8080 go run ./cmd/ingest-gateway +down: + docker-compose down -run-query: - AGENTMON_QUERY_ADDR=:8081 go run ./cmd/query-api - -run-ui: - AGENTMON_UI_ADDR=:8082 go run ./cmd/web-ui - -tidy: - go mod tidy - -test: - go test ./... - -run-ingest: - AGENTMON_ADDR=:8080 go run ./cmd/ingest-gateway - -run-query: - AGENTMON_QUERY_ADDR=:8081 go run ./cmd/query-api +logs: + docker-compose logs -f diff --git a/README.md b/README.md new file mode 100644 index 0000000..8708c4a --- /dev/null +++ b/README.md @@ -0,0 +1,253 @@ +# agentmon + +Telemetry and monitoring system for AI agent activity across [OpenClaw](https://openclaw.ai/) instances running on KVM virtual machines. Captures sessions, runs, tool calls, errors, and VM health metrics — viewable in a real-time web dashboard. 
+ +## Architecture + +``` + ┌──────────────────────────┐ + │ OpenClaw VMs │ + │ (zap, orb, sun) │ + │ │ + │ hooks/agentmon/ │ + │ → handler.ts │ + └──────────┬───────────────┘ + │ HTTP POST + ▼ +┌─────────────┐ publish ┌──────────────┐ +│ openclaw- │────────────▶│ NATS │ +│ monitor │ │ :4222 │ +│ (VM polls) │ └──────┬───────┘ +└─────────────┘ │ subscribe + ▼ + ┌──────────────────┐ + │ event-processor │ + └────────┬─────────┘ + │ INSERT + ▼ +┌─────────────┐ query ┌──────────────┐ proxy ┌──────────────┐ +│ web-ui │◀────────▶│ query-api │◀──────────│ browser │ +│ :8082 │ │ :8081 │ └──────────────┘ +└─────────────┘ └──────────────┘ + ▲ + │ + ┌────────┴───────┐ + │ PostgreSQL │ + │ :5432 │ + └────────────────┘ +``` + +**Data flow:** OpenClaw hooks emit telemetry events over HTTP to the **ingest gateway**, which publishes them to **NATS**. The **event processor** subscribes and persists events to **PostgreSQL**. The **query API** serves aggregated data (sessions, runs, spans) to the **web UI**. A separate **openclaw-monitor** polls VM health metrics (CPU, memory, disk, service status) via libvirt and SSH. + +Real-time updates flow through NATS → query-api → WebSocket → browser. + +## Services + +| Service | Port | Description | +|---------|------|-------------| +| **ingest-gateway** | 8080 | HTTP + WebSocket event ingestion, publishes to NATS | +| **query-api** | 8081 | REST API for sessions, runs, spans; WebSocket live feed | +| **web-ui** | 8082 | SPA frontend with reverse proxy to query-api | +| **event-processor** | — | NATS subscriber, persists events to Postgres | +| **openclaw-monitor** | — | Polls VM instances via libvirt/SSH, emits snapshots | +| **postgres** | 5432 | Event storage | +| **nats** | 4222 | Message queue (JetStream) | + +## Quick Start + +```bash +cp .env.example .env +make up +``` + +This starts Postgres, NATS, and all application services via Docker Compose. Open http://localhost:8082. 
+ +For local development, start infrastructure only and run services manually: + +```bash +make up # postgres + nats +make run-ingest # terminal 1 +make run-query # terminal 2 +make run-ui # terminal 3 +make run-processor # terminal 4 +make run-openclaw-monitor # terminal 5 +``` + +Or use the convenience scripts: + +```bash +./start-all.sh # start everything +./stop-all.sh # stop everything +``` + +## Configuration + +Environment variables (see `.env.example`): + +| Variable | Default | Description | +|----------|---------|-------------| +| `DATABASE_URL` | — | Postgres connection string (required) | +| `NATS_URL` | `nats://nats:4222` | NATS server address | +| `NATS_TOPIC` | `agentmon.events.v1` | NATS topic for events | +| `AGENTMON_ADDR` | `:8080` | Ingest gateway listen address | +| `AGENTMON_QUERY_ADDR` | `:8081` | Query API listen address | +| `AGENTMON_UI_ADDR` | `:8082` | Web UI listen address | +| `AGENTMON_QUERY_BASE` | `http://query-api` | Query API URL (for web-ui proxy) | +| `OPENCLAW_REGISTRY` | `~/.claude/state/openclaw-instances.json` | VM instance registry | +| `POLL_INTERVAL` | `30s` | VM polling interval | + +## API + +### Ingest Gateway (`:8080`) + +``` +GET /healthz Health check +POST /v1/events Batch event ingestion (JSON array) +GET /v1/ws WebSocket event stream +``` + +### Query API (`:8081`) + +``` +GET /healthz Health check +GET /v1/events List events (?event_type=&framework=&limit=) +GET /v1/sessions List sessions (?from=&to=&framework=&host=&cursor=&limit=) +GET /v1/sessions/{id} Session detail with runs +GET /v1/runs/{id} Run detail with spans +GET /v1/ws WebSocket live event broadcast +``` + +## Event Schema + +Events follow the `agentmon.event` envelope format: + +```json +{ + "schema": { "name": "agentmon.event", "version": 1 }, + "event": { + "id": "uuid", + "type": "session.start", + "ts": "2026-03-13T12:00:00Z", + "source": { + "framework": "openclaw", + "client_id": "zap", + "host": "zap" + } + }, + "correlation": { + 
"session_id": "uuid", + "run_id": "uuid", + "span_id": "uuid" + }, + "attributes": {}, + "payload": {} +} +``` + +**Event types:** `session.start`, `session.end`, `run.start`, `run.end`, `span.start`, `span.end`, `error`, `metric.snapshot`, `openclaw.snapshot` + +## Database Schema + +```sql +CREATE TABLE events ( + event_id TEXT PRIMARY KEY, + ts TIMESTAMPTZ NOT NULL, + type TEXT NOT NULL, + session_id TEXT, + run_id TEXT, + trace_id TEXT, + span_id TEXT, + parent_span_id TEXT, + source_framework TEXT, + client_id TEXT, + payload JSONB NOT NULL +); +``` + +## OpenClaw Hook + +The `hooks/agentmon/` directory contains a TypeScript hook that captures agent activity from OpenClaw instances and emits it to the ingest gateway. It maps OpenClaw events to agentmon's session/run/span model: + +| OpenClaw Event | agentmon Event | Description | +|----------------|----------------|-------------| +| `command:new` | `session.start` | New conversation started | +| `command:stop` | `session.end` | Conversation ended | +| `command:reset` | `session.end` + `session.start` | Conversation reset | +| `message:received` | `run.start` | User message received | +| `message:sent` | `run.end` | Agent response sent | +| `tool_result_persist` | `span.end` | Tool call completed | +| `session:compact:before` | `span.start` | Context compaction started | +| `session:compact:after` | `span.end` | Context compaction finished | + +### Deploying the hook + +The hook is deployed to each VM at `~/.openclaw/hooks/agentmon/`. Two environment variables are required in `~/.openclaw/.env`: + +```bash +AGENTMON_INGEST_URL=http://192.168.122.1:8080 +AGENTMON_VM_NAME=zap # or orb, sun +``` + +Deployment is automated via Ansible — see the [swarm ansible playbook](https://gitea-http.taildb3494.ts.net/will/swarm) `playbooks/customize.yml`. 
import (
	"context"
	"crypto/rand"
	"encoding/json"
	"log"
	"os"
	"time"

	"agentmon/internal/monitor/openclaw"
	qnats "agentmon/internal/queue/nats"
)
// generateID returns an identifier for an emitted event: the current local
// time formatted as YYYYMMDDhhmmss, a dash, and eight random characters from
// randomString.
func generateID() string {
	stamp := time.Now().Format("20060102150405")
	return stamp + "-" + randomString(8)
}

// randomString returns n characters drawn from the lowercase ASCII letters and
// digits. It returns "" when n is zero or negative.
//
// The bytes come from crypto/rand. The previous implementation derived every
// character from the wall clock's nanosecond field and slept between
// characters, which made the output correlated, predictable, and slow.
func randomString(n int) string {
	const chars = "abcdefghijklmnopqrstuvwxyz0123456789"
	if n <= 0 {
		return ""
	}

	b := make([]byte, n)
	if _, err := rand.Read(b); err != nil {
		// Best effort: keep producing IDs even if the system entropy source
		// is unavailable, by expanding the clock through a 64-bit LCG.
		state := uint64(time.Now().UnixNano())
		for i := range b {
			state = state*6364136223846793005 + 1442695040888963407
			b[i] = byte(state >> 56)
		}
	}

	// 256 is not a multiple of len(chars), so the first few characters are
	// marginally more likely; that is irrelevant for an ID suffix that only
	// needs to be unique.
	for i, v := range b {
		b[i] = chars[int(v)%len(chars)]
	}
	return string(b)
}

// envDefault returns the value of the environment variable key, or def when
// the variable is unset or set to the empty string.
func envDefault(key, def string) string {
	v := os.Getenv(key)
	if v == "" {
		return def
	}
	return v
}
"nats://localhost:4222") + if dsn == "" { log.Fatalf("DATABASE_URL is required") } @@ -28,6 +98,14 @@ func main() { } defer func() { _ = db.Close() }() + nc, err := nats.Connect(natsURL) + if err != nil { + log.Printf("warning: failed to connect to NATS: %v", err) + } else { + natsConn = nc + go subscribeToNATS(nc) + } + r := chi.NewRouter() r.Use(middleware.RequestID) r.Use(middleware.RealIP) @@ -39,9 +117,16 @@ func main() { _, _ = w.Write([]byte("ok")) }) + r.Get("/v1/ws", wsHandler) + r.Get("/v1/events", func(w http.ResponseWriter, r *http.Request) { limit, _ := strconv.Atoi(r.URL.Query().Get("limit")) - events, err := db.ListRecentEvents(r.Context(), limit) + f := postgres.EventsFilter{ + Limit: limit, + EventType: r.URL.Query().Get("event_type"), + Framework: r.URL.Query().Get("framework"), + } + events, err := db.ListRecentEvents(r.Context(), f) if err != nil { httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"}) return diff --git a/cmd/web-ui/main.go b/cmd/web-ui/main.go index 893dc32..2ecfdd1 100644 --- a/cmd/web-ui/main.go +++ b/cmd/web-ui/main.go @@ -48,7 +48,7 @@ func main() { // SPA catch-all: serve index.html for all other routes mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { // Serve index.html for SPA routes - if r.URL.Path == "/" || strings.HasPrefix(r.URL.Path, "/sessions") || strings.HasPrefix(r.URL.Path, "/runs") { + if r.URL.Path == "/" || strings.HasPrefix(r.URL.Path, "/sessions") || strings.HasPrefix(r.URL.Path, "/runs") || strings.HasPrefix(r.URL.Path, "/openclaw") || strings.HasPrefix(r.URL.Path, "/agents") { f, err := staticFiles.Open("static/index.html") if err != nil { http.Error(w, "index.html not found", http.StatusInternalServerError) diff --git a/cmd/web-ui/static/app.js b/cmd/web-ui/static/app.js index 1aa36d9..b216f71 100644 --- a/cmd/web-ui/static/app.js +++ b/cmd/web-ui/static/app.js @@ -1,18 +1,91 @@ (function() { const app = document.getElementById('app'); - // Router - 
/**
 * Open the shared WebSocket (URL from getWsURL) unless one is already open
 * or in the middle of connecting.
 *
 * Every registered callback in wsCallbacks is notified with
 * `{ type: 'connected' }`, `{ type: 'message', data }` (parsed JSON) or
 * `{ type: 'disconnected' }`. After a close, or a failure to construct the
 * socket, another attempt is scheduled five seconds later.
 */
function connectWS() {
  if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) {
    return;
  }

  // This attempt supersedes any reconnect that is still pending; without
  // clearing it, a manual connect during the back-off window leaves the old
  // timer armed and timers can accumulate.
  if (wsReconnectTimeout) {
    clearTimeout(wsReconnectTimeout);
    wsReconnectTimeout = null;
  }

  try {
    ws = new WebSocket(getWsURL());

    ws.onopen = () => {
      console.log('WebSocket connected');
      wsCallbacks.forEach(cb => cb({ type: 'connected' }));
    };

    ws.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);
        wsCallbacks.forEach(cb => cb({ type: 'message', data }));
      } catch (e) {
        // A malformed frame must not take down the other subscribers.
        console.error('Failed to parse WS message:', e);
      }
    };

    ws.onclose = () => {
      console.log('WebSocket disconnected');
      wsCallbacks.forEach(cb => cb({ type: 'disconnected' }));
      wsReconnectTimeout = setTimeout(connectWS, 5000);
    };

    ws.onerror = (err) => {
      console.error('WebSocket error:', err);
    };
  } catch (e) {
    console.error('Failed to connect WebSocket:', e);
    wsReconnectTimeout = setTimeout(connectWS, 5000);
  }
}
renderAgents(); + } else if (path.startsWith('/openclaw')) { + renderOpenClaw(); } else if (path.startsWith('/sessions/')) { - const sessionID = path.split('/sessions/')[1]; - renderSession(sessionID); + renderSession(path.split('/sessions/')[1]); } else if (path.startsWith('/runs/')) { - const runID = path.split('/runs/')[1]; - renderRun(runID); + renderRun(path.split('/runs/')[1]); } else { app.innerHTML = '

Page not found

/**
 * Escape a value for safe interpolation into HTML text or attribute values.
 *
 * `null` and `undefined` become the empty string; every other value is
 * converted with String() first. The ampersand is replaced first so the
 * entities produced by the later replacements are not escaped twice.
 *
 * @param {*} value - value to render
 * @returns {string} the escaped text
 */
function escapeHTML(value) {
  return String(value ?? '')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
getEnvelopeEvent(record).ts || ''; + } + + function getEnvelopeSource(record) { + return getEnvelopeEvent(record).source || {}; + } + + function getEnvelopePayload(record) { + const envelope = extractEnvelope(record); + return envelope.payload || envelope.Payload || {}; + } + + function getEnvelopeAttributes(record) { + const envelope = extractEnvelope(record); + return envelope.attributes || envelope.Attributes || {}; + } + + function getEnvelopeCorrelation(record) { + const envelope = extractEnvelope(record); + return envelope.correlation || envelope.Correlation || {}; + } + + function getRecordID(record) { + return record?.event_id || getEnvelopeEvent(record).id || ''; + } + + function isCurrentPath(prefix) { + return window.location.pathname.startsWith(prefix); + } async function renderSessions() { app.innerHTML = ` +
@@ -69,26 +215,28 @@ +
- - - - - - - - - - - -
SessionFrameworkHostRunsTime
+
+ + + + + + + + + + + +
SessionFrameworkHostRunsTime
+
`; - // Bind filter events ['from', 'to', 'framework', 'host'].forEach(f => { document.getElementById('filter-' + f).addEventListener('change', () => { sessionsState.sessions = []; @@ -122,12 +270,12 @@ const tbody = document.getElementById('sessions-body'); tbody.innerHTML = sessionsState.sessions.map(s => ` - - ${s.session_id.substring(0, 12)}... - ${s.framework || '-'} - ${s.host || '-'} + + ${escapeHTML(s.session_id.substring(0, 12))}... + ${escapeHTML(s.framework || '-')} + ${escapeHTML(s.host || '-')} ${s.run_count} - ${relativeTime(s.started_at)} + ${escapeHTML(relativeTime(s.started_at))} `).join('') || 'No sessions found'; @@ -138,12 +286,10 @@ document.getElementById('load-more').style.display = sessionsState.cursor ? 'block' : 'none'; } - // Session detail async function renderSession(sessionID) { const data = await api('/v1/sessions/' + sessionID); const s = data.session; const runs = data.runs || []; - const duration = s.ended_at ? formatDuration(new Date(s.ended_at) - new Date(s.started_at)) : 'ongoing'; @@ -151,42 +297,44 @@ app.innerHTML = ` ← Back to Sessions +
Runs ${runs.length}
+
+ + + + + + + + + + + + ${runs.map(r => { + const runDuration = r.ended_at + ? formatDuration(new Date(r.ended_at) - new Date(r.started_at)) + : '-'; + return ` + + + + + + + + `; + }).join('') || ''} + +
Run IDStatusSpansDurationStarted
${escapeHTML(r.run_id.substring(0, 12))}...${statusIcon(r.status)}${r.span_count}${escapeHTML(runDuration)}${escapeHTML(new Date(r.started_at).toLocaleTimeString())}
No runs
-

Runs (${runs.length})

- - - - - - - - - - - - ${runs.map(r => { - const dur = r.ended_at - ? formatDuration(new Date(r.ended_at) - new Date(r.started_at)) - : '-'; - return ` - - - - - - - - `; - }).join('') || ''} - -
Run IDStatusSpansDurationStarted
${r.run_id.substring(0, 12)}...${statusIcon(r.status)} ${r.status}${r.span_count}${dur}${new Date(r.started_at).toLocaleTimeString()}
No runs
`; document.querySelectorAll('tr.clickable').forEach(row => { @@ -199,51 +347,51 @@ }); } - // Run detail async function renderRun(runID) { const data = await api('/v1/runs/' + runID); const r = data.run; const spans = data.spans || []; - const duration = r.ended_at ? formatDuration(new Date(r.ended_at) - new Date(r.started_at)) : 'ongoing'; app.innerHTML = ` - ← Back to Session + ← Back to Session -

Spans (${spans.length})

- - - - - - - - - - - ${spans.map((sp, i) => ` - - - - - +
Spans ${spans.length}
+
+
NameKindStatusDuration
+ + + + + + - - - - `).join('') || ''} - -
NameKindStatusDuration
No spans
+ + + ${spans.map((sp, i) => ` + + ${escapeHTML(sp.name)} + ${escapeHTML(sp.kind)} + ${statusIcon(sp.status)} + ${escapeHTML(formatDuration(sp.duration_ms))} + + + +
${escapeHTML(JSON.stringify(sp.payload, null, 2))}
+ + + `).join('') || 'No spans'} + + + `; document.querySelectorAll('tr.expandable').forEach(row => { @@ -267,6 +415,497 @@ }); } - // Start + async function renderOpenClaw() { + app.innerHTML = '

Loading...

'; + + openclawUnsubscribe = subscribeWS(handleOpenClawWS); + + try { + const data = await api('/v1/events?event_type=openclaw.snapshot&limit=100'); + mergeOpenClawEvents(data.events || []); + if (isCurrentPath('/openclaw')) { + renderOpenClawGrid(); + } + } catch (e) { + if (isCurrentPath('/openclaw')) { + app.innerHTML = `

Error loading: ${escapeHTML(e.message)}

`; + } + } + } + + function handleOpenClawWS(msg) { + if (msg.type !== 'message') { + return; + } + + if (getEnvelopeType(msg.data) !== 'openclaw.snapshot') { + return; + } + + mergeOpenClawEvents([msg.data]); + + if (isCurrentPath('/openclaw')) { + renderOpenClawGrid(); + } + if (isCurrentPath('/agents')) { + renderAgentVMStrip(); + } + } + + function mergeOpenClawEvents(events) { + for (const evt of events) { + const payload = getEnvelopePayload(evt); + const instance = payload.instance || {}; + if (!instance.name) { + continue; + } + + const existing = openclawState.instances[instance.name]; + const nextTS = new Date(getEnvelopeTS(evt) || 0).getTime(); + const currentTS = existing ? new Date(getEnvelopeTS(existing) || 0).getTime() : 0; + if (!existing || nextTS >= currentTS) { + openclawState.instances[instance.name] = evt; + } + } + } + + function renderOpenClawGrid() { + const names = Object.keys(openclawState.instances).sort(); + + if (names.length === 0) { + app.innerHTML = ` + +

No OpenClaw instances found

+ `; + return; + } + + app.innerHTML = ` + +
+ ${names.map(name => { + const evt = openclawState.instances[name]; + const payload = getEnvelopePayload(evt); + const inst = payload.instance || {}; + const host = payload.host || {}; + const guest = payload.guest; + const issues = payload.issues; + + return ` +
+
+

${escapeHTML(inst.name || name)}

+
+ ${host.state === 'running' ? 'Running' : 'Stopped'} +
+
+
Updated ${escapeHTML(relativeTime(getEnvelopeTS(evt)))}
+ + + + + + + + ${guest ? ` + + + + + + + + ` : ''} +
Host${escapeHTML(inst.host || '-')}
Domain${escapeHTML(inst.domain || '-')}
vCPUs${host.vcpus || '-'}
Memory${escapeHTML(formatBytes(host.memory_kib ? host.memory_kib * 1024 : 0) || '-')}
Disk${escapeHTML(formatBytes(host.disk_actual_bytes) || '-')}
Autostart${host.autostart ? 'Yes' : 'No'}
Gateway${guest.service_active ? 'Active' : 'Inactive'}
HTTP${guest.http_status || 'N/A'}
Version${escapeHTML(guest.version || '-')}
Guest Memory${guest.memory_percent !== undefined ? guest.memory_percent.toFixed(1) : '-'}%
Guest Disk${guest.disk_percent !== undefined ? guest.disk_percent.toFixed(1) : '-'}%
Load${guest.load_average !== undefined ? guest.load_average.toFixed(2) : '-'}
Uptime${escapeHTML(guest.service_uptime || '-')}
+ ${issues ? ` +
+ ${Object.entries(issues).filter(([, value]) => value).map(([key]) => ` + ${escapeHTML(key.replace(/_/g, ' '))} + `).join('')} +
+ ` : ''} +
+ `; + }).join('')} +
+ `; + } + + function createAgentsState() { + return { + events: [], + eventIDs: new Set(), + stats: { + messages: 0, + tools: 0, + errors: 0, + toolCounts: {}, + }, + }; + } + + function getVMStatus() { + const names = ['zap', 'orb', 'sun']; + return names.map(name => { + const snapshot = openclawState.instances[name]; + const payload = snapshot ? getEnvelopePayload(snapshot) : {}; + const host = payload.host || {}; + return { + name, + active: host.state === 'running', + }; + }); + } + + async function renderAgents() { + agentsState = createAgentsState(); + + app.innerHTML = ` + +
+
+
+

Loading agent activity...

+
+
+
+
Messages
+
0
+
received and sent
+
+
+
Tool Calls
+
0
+
+
+
Errors
+
0
+
+
+
Top Tools
+
    +
  • No data yet
  • +
+
+
+
+ `; + + renderAgentVMStrip(); + + try { + const [snapshots, events] = await Promise.all([ + api('/v1/events?event_type=openclaw.snapshot&limit=100').catch(() => ({ events: [] })), + api('/v1/events?framework=openclaw&limit=200'), + ]); + + if (!isCurrentPath('/agents')) { + return; + } + + mergeOpenClawEvents(snapshots.events || []); + renderAgentVMStrip(); + addAgentEvents((events.events || []).slice().reverse()); + renderAgentTimeline(); + renderAgentStats(); + } catch (e) { + const timeline = document.getElementById('agents-timeline'); + if (timeline) { + timeline.innerHTML = `

Error loading agent activity: ${escapeHTML(e.message)}

`; + } + } + + agentsUnsubscribe = subscribeWS(handleAgentsWS); + } + + function renderAgentVMStrip() { + const strip = document.getElementById('agents-vm-strip'); + if (!strip) { + return; + } + + const vms = getVMStatus(); + strip.innerHTML = vms.map(vm => ` +
+ + ${escapeHTML(vm.name)} + ${vm.active ? 'online' : 'offline'} +
+ `).join(''); + } + + function handleAgentsWS(msg) { + if (msg.type !== 'message') { + return; + } + + const eventType = getEnvelopeType(msg.data); + if (eventType === 'openclaw.snapshot') { + mergeOpenClawEvents([msg.data]); + renderAgentVMStrip(); + return; + } + + const framework = getEnvelopeSource(msg.data).framework || msg.data.source_framework; + if (framework !== 'openclaw') { + return; + } + + addAgentEvents([msg.data]); + renderAgentTimeline(); + renderAgentStats(); + } + + function addAgentEvents(events) { + let changed = false; + + for (const evt of events) { + const id = getRecordID(evt); + if (!id || agentsState.eventIDs.has(id)) { + continue; + } + agentsState.eventIDs.add(id); + agentsState.events.push(evt); + changed = true; + } + + if (!changed) { + return; + } + + agentsState.events.sort((a, b) => new Date(getEnvelopeTS(a)).getTime() - new Date(getEnvelopeTS(b)).getTime()); + + while (agentsState.events.length > 500) { + const removed = agentsState.events.shift(); + agentsState.eventIDs.delete(getRecordID(removed)); + } + + recomputeAgentStats(); + } + + function recomputeAgentStats() { + const stats = { + messages: 0, + tools: 0, + errors: 0, + toolCounts: {}, + }; + + for (const evt of agentsState.events) { + const eventType = getEnvelopeType(evt); + const attrs = getEnvelopeAttributes(evt); + + if (eventType === 'run.start' || eventType === 'run.end') { + stats.messages++; + } + + if (eventType === 'span.end' && attrs.span_kind === 'tool') { + stats.tools++; + const toolName = attrs.name || 'unknown'; + stats.toolCounts[toolName] = (stats.toolCounts[toolName] || 0) + 1; + } + + if (eventType === 'error') { + stats.errors++; + } + } + + agentsState.stats = stats; + } + + function getEventIcon(eventType) { + switch (eventType) { + case 'run.start': + return '
'; + case 'run.end': + return '
'; + case 'span.start': + case 'span.end': + return '
'; + case 'error': + return '
!
'; + case 'session.start': + case 'session.end': + return '
'; + default: + return '
·
'; + } + } + + function getEventLabel(eventType) { + const labels = { + 'session.start': 'Session Started', + 'session.end': 'Session Ended', + 'run.start': 'Message Received', + 'run.end': 'Response Sent', + 'span.start': 'Span Started', + 'span.end': 'Span Completed', + 'error': 'Error', + 'metric.snapshot': 'Metric', + }; + return labels[eventType] || eventType; + } + + function getVMName(evt) { + return getEnvelopeSource(evt).client_id || evt.client_id || 'unknown'; + } + + function getVMClassName(vmName) { + const normalized = String(vmName || 'unknown').toLowerCase(); + return ['zap', 'orb', 'sun'].includes(normalized) ? normalized : 'unknown'; + } + + function getEventBody(evt) { + const eventType = getEnvelopeType(evt); + const payload = getEnvelopePayload(evt); + const attrs = getEnvelopeAttributes(evt); + const correlation = getEnvelopeCorrelation(evt); + + if (eventType === 'span.start' || eventType === 'span.end') { + const name = attrs.name || attrs.span_kind || 'unknown span'; + const duration = payload.duration_ms !== undefined && payload.duration_ms !== null + ? ` ${escapeHTML(formatDuration(payload.duration_ms))}` + : ''; + return `
${escapeHTML(name)}${duration}
`; + } + + if (eventType === 'run.start') { + const preview = payload.message_preview || payload.message || ''; + if (!preview) { + return ''; + } + const trimmed = preview.length > 140 ? preview.slice(0, 140) + '...' : preview; + return `
"${escapeHTML(trimmed)}"
`; + } + + if (eventType === 'run.end') { + return `
${statusIcon(payload.status || 'unknown')}
`; + } + + if (eventType === 'error') { + const errPayload = payload.error || {}; + const errType = errPayload.type || 'error'; + const message = errPayload.message || payload.message || 'unknown'; + return `
${escapeHTML(errType + ': ' + message)}
`; + } + + if (eventType === 'session.start' || eventType === 'session.end') { + return correlation.session_id + ? `
session ${escapeHTML(correlation.session_id)}
` + : ''; + } + + return ''; + } + + function getEventDetails(evt) { + const details = {}; + const correlation = getEnvelopeCorrelation(evt); + const attributes = getEnvelopeAttributes(evt); + const payload = getEnvelopePayload(evt); + + if (Object.keys(correlation).length > 0) { + details.correlation = correlation; + } + if (Object.keys(attributes).length > 0) { + details.attributes = attributes; + } + if (Object.keys(payload).length > 0) { + details.payload = payload; + } + + if (Object.keys(details).length === 0) { + return ''; + } + + return JSON.stringify(details, null, 2); + } + + function renderAgentTimeline() { + const timeline = document.getElementById('agents-timeline'); + if (!timeline) { + return; + } + + const recent = agentsState.events.slice(-100).reverse(); + if (recent.length === 0) { + timeline.innerHTML = '

Waiting for agent activity...

'; + return; + } + + timeline.innerHTML = recent.map((evt, index) => { + const eventType = getEnvelopeType(evt); + const vmName = getVMName(evt); + const vmClass = getVMClassName(vmName); + const details = getEventDetails(evt); + const detailHTML = details ? `
${escapeHTML(details)}
` : ''; + const expandHTML = details ? '' : ''; + + return ` +
+
+ ${getEventIcon(eventType)} + ${escapeHTML(vmName)} + ${escapeHTML(getEventLabel(eventType))} + ${escapeHTML(new Date(getEnvelopeTS(evt)).toLocaleTimeString())} +
+ ${getEventBody(evt)} + ${expandHTML} + ${detailHTML} +
+ `; + }).join(''); + + timeline.querySelectorAll('.timeline-expand-hint').forEach(button => { + button.addEventListener('click', () => { + button.parentElement.classList.toggle('expanded'); + }); + }); + } + + function renderAgentStats() { + const stats = agentsState.stats; + + const messagesEl = document.getElementById('stat-messages'); + if (messagesEl) { + messagesEl.textContent = String(stats.messages); + } + + const toolsEl = document.getElementById('stat-tools'); + if (toolsEl) { + toolsEl.textContent = String(stats.tools); + } + + const errorsEl = document.getElementById('stat-errors'); + if (errorsEl) { + errorsEl.textContent = String(stats.errors); + } + + const list = document.getElementById('stat-top-tools'); + if (!list) { + return; + } + + const topTools = Object.entries(stats.toolCounts) + .sort((a, b) => b[1] - a[1]) + .slice(0, 8); + + if (topTools.length === 0) { + list.innerHTML = '
  • No data yet
  • '; + return; + } + + list.innerHTML = topTools.map(([name, count]) => ` +
  • + ${escapeHTML(name)} + ${count} +
  • + `).join(''); + } + route(); })(); diff --git a/cmd/web-ui/static/index.html b/cmd/web-ui/static/index.html index ca2de47..88e388c 100644 --- a/cmd/web-ui/static/index.html +++ b/cmd/web-ui/static/index.html @@ -4,11 +4,17 @@ agentmon + + +
    -

    agentmon

    + +

    Loading...

    diff --git a/cmd/web-ui/static/style.css b/cmd/web-ui/static/style.css index 3fa3c38..96e3087 100644 --- a/cmd/web-ui/static/style.css +++ b/cmd/web-ui/static/style.css @@ -1,70 +1,280 @@ -* { box-sizing: border-box; margin: 0; padding: 0; } +/* ============================================================ + agentmon — Refined Dark UI + ============================================================ */ -body { - font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; - background: #0d1117; - color: #c9d1d9; - line-height: 1.5; +:root { + --bg: #07090f; + --surface: #0d1117; + --surface-2: #121922; + --card: rgba(13, 20, 32, 0.85); + --border: #1c2637; + --border-soft: rgba(28, 38, 55, 0.6); + + --text: #c8d3e0; + --text-dim: #4e6070; + --text-bright: #e8eef4; + + --accent: #22d3ee; + --accent-dim: rgba(34, 211, 238, 0.08); + --accent-glow: rgba(34, 211, 238, 0.2); + + --success: #34d399; + --error: #f87171; + --warning: #fbbf24; + --purple: #a78bfa; + + --font-display: 'Syne', sans-serif; + --font-body: 'Outfit', sans-serif; + --font-mono: 'Fira Code', monospace; + + --radius: 8px; + --radius-lg: 12px; + --radius-xl: 16px; } +/* ── Reset ─────────────────────────────────────────────────── */ +*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } + +/* ── Base ──────────────────────────────────────────────────── */ +html { scroll-behavior: smooth; } + +body { + font-family: var(--font-body); + font-size: 15px; + background-color: var(--bg); + background-image: + radial-gradient(ellipse 80% 40% at 50% -20%, rgba(34, 211, 238, 0.04) 0%, transparent 70%), + radial-gradient(circle at 1px 1px, rgba(34, 211, 238, 0.045) 1px, transparent 0); + background-size: 100% 100%, 28px 28px; + color: var(--text); + line-height: 1.6; + min-height: 100vh; + -webkit-font-smoothing: antialiased; +} + +/* ── Header ────────────────────────────────────────────────── */ header { - background: #161b22; - padding: 1rem 2rem; - border-bottom: 1px 
solid #30363d; + position: sticky; + top: 0; + z-index: 100; + display: flex; + align-items: center; + justify-content: space-between; + padding: 0 2rem; + height: 54px; + background: rgba(7, 9, 15, 0.82); + backdrop-filter: blur(16px); + -webkit-backdrop-filter: blur(16px); + border-bottom: 1px solid var(--border); +} + +header::after { + content: ''; + position: absolute; + bottom: -1px; + left: 0; + right: 0; + height: 1px; + background: linear-gradient(90deg, transparent 0%, var(--accent) 40%, var(--accent) 60%, transparent 100%); + opacity: 0.15; + pointer-events: none; +} + +.header-logo { + display: flex; + align-items: center; + gap: 0.5rem; } header h1 a { - color: #58a6ff; + font-family: var(--font-display); + font-size: 1rem; + font-weight: 800; + color: var(--text-bright); text-decoration: none; + letter-spacing: 0.08em; + text-transform: uppercase; } -main { - max-width: 1200px; - margin: 0 auto; - padding: 2rem; -} - -.back-link { +.logo-dot { display: inline-block; - margin-bottom: 1rem; - color: #58a6ff; - text-decoration: none; + width: 6px; + height: 6px; + border-radius: 50%; + background: var(--accent); + box-shadow: 0 0 8px var(--accent-glow); + margin-left: 2px; + vertical-align: middle; + position: relative; + top: -1px; } -.back-link:hover { text-decoration: underline; } +header nav { + display: flex; + align-items: center; + gap: 0.25rem; +} +header nav a { + font-size: 0.8rem; + font-weight: 500; + color: var(--text-dim); + text-decoration: none; + padding: 0.375rem 0.875rem; + border-radius: var(--radius); + letter-spacing: 0.04em; + transition: color 0.15s, background 0.15s; +} + +header nav a:hover { + color: var(--text-bright); + background: var(--surface-2); +} + +/* ── Main ──────────────────────────────────────────────────── */ +main { + max-width: 1240px; + margin: 0 auto; + padding: 2.5rem 2rem; +} + +/* Page entry animation */ +@keyframes fadeUp { + from { opacity: 0; transform: translateY(10px); } + to { opacity: 1; transform: 
translateY(0); } +} + +main > * { + animation: fadeUp 0.28s ease both; +} + +/* ── Back link ─────────────────────────────────────────────── */ +.back-link { + display: inline-flex; + align-items: center; + gap: 0.4rem; + margin-bottom: 1.75rem; + color: var(--text-dim); + text-decoration: none; + font-size: 0.8rem; + font-weight: 500; + letter-spacing: 0.03em; + transition: color 0.15s; +} + +.back-link:hover { color: var(--accent); } + +/* ── Page header ───────────────────────────────────────────── */ .page-header { - margin-bottom: 1.5rem; + margin-bottom: 2rem; + padding-bottom: 1.5rem; + border-bottom: 1px solid var(--border-soft); } .page-header h2 { - font-size: 1.5rem; - margin-bottom: 0.5rem; + font-family: var(--font-display); + font-size: 1.55rem; + font-weight: 700; + color: var(--text-bright); + margin-bottom: 0.6rem; + letter-spacing: -0.02em; } -.meta { color: #8b949e; font-size: 0.9rem; } +.meta { + display: flex; + flex-wrap: wrap; + gap: 0.4rem 1.25rem; + color: var(--text-dim); + font-size: 0.8rem; +} +.meta-item { + display: flex; + align-items: center; + gap: 0.3rem; +} + +.meta-label { + font-size: 0.72rem; + text-transform: uppercase; + letter-spacing: 0.06em; +} + +/* ── Section title ─────────────────────────────────────────── */ +.section-title { + font-family: var(--font-display); + font-size: 1rem; + font-weight: 700; + color: var(--text-bright); + margin-bottom: 1rem; + letter-spacing: 0.01em; + display: flex; + align-items: center; + gap: 0.6rem; +} + +.section-title .count { + font-family: var(--font-mono); + font-size: 0.72rem; + font-weight: 500; + color: var(--text-dim); + background: var(--surface-2); + border: 1px solid var(--border); + padding: 0.1rem 0.45rem; + border-radius: 999px; + letter-spacing: 0.04em; +} + +/* ── Filters ───────────────────────────────────────────────── */ .filters { display: flex; - gap: 1rem; + gap: 0.75rem; margin-bottom: 1.5rem; flex-wrap: wrap; + align-items: flex-end; } .filters label { display: 
flex; flex-direction: column; - gap: 0.25rem; - font-size: 0.85rem; - color: #8b949e; + gap: 0.35rem; + font-size: 0.7rem; + font-weight: 600; + color: var(--text-dim); + text-transform: uppercase; + letter-spacing: 0.08em; } -.filters input, .filters select { - background: #21262d; - border: 1px solid #30363d; - color: #c9d1d9; - padding: 0.5rem; - border-radius: 4px; +.filters input, +.filters select { + background: var(--surface); + border: 1px solid var(--border); + color: var(--text); + padding: 0.45rem 0.75rem; + border-radius: var(--radius); + font-family: var(--font-body); + font-size: 0.85rem; + transition: border-color 0.15s, box-shadow 0.15s; + outline: none; + min-width: 140px; +} + +.filters input:focus, +.filters select:focus { + border-color: var(--accent); + box-shadow: 0 0 0 3px var(--accent-dim); +} + +.filters select option { + background: var(--surface-2); +} + +/* ── Table container ───────────────────────────────────────── */ +.table-container { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + overflow: hidden; } table { @@ -74,55 +284,654 @@ table { th, td { text-align: left; - padding: 0.75rem 1rem; - border-bottom: 1px solid #21262d; + padding: 0.7rem 1.25rem; } th { - background: #161b22; - font-weight: 600; - font-size: 0.85rem; + background: var(--surface-2); + font-size: 0.68rem; + font-weight: 700; text-transform: uppercase; - color: #8b949e; + letter-spacing: 0.1em; + color: var(--text-dim); + border-bottom: 1px solid var(--border); + white-space: nowrap; } -tr:hover { background: #161b22; } +td { + font-size: 0.875rem; + border-bottom: 1px solid var(--border-soft); + color: var(--text); +} + +tr:last-child td { border-bottom: none; } tr.clickable { cursor: pointer; } -.status-success { color: #3fb950; } -.status-error { color: #f85149; } -.status-unknown { color: #d29922; } +tr.clickable:hover td { + background: var(--surface-2); + color: var(--text-bright); +} +tr.clickable:hover 
td:first-child { + border-left: 2px solid var(--accent); + padding-left: calc(1.25rem - 2px); +} + +/* ── Status badges ─────────────────────────────────────────── */ +.status-badge { + display: inline-flex; + align-items: center; + gap: 0.35rem; + padding: 0.2rem 0.6rem; + border-radius: 999px; + font-size: 0.72rem; + font-weight: 600; + letter-spacing: 0.04em; + white-space: nowrap; +} + +.status-dot { + width: 5px; + height: 5px; + border-radius: 50%; + flex-shrink: 0; +} + +.status-success { + color: var(--success); + background: rgba(52, 211, 153, 0.1); + border: 1px solid rgba(52, 211, 153, 0.2); +} +.status-success .status-dot { background: var(--success); } + +.status-error { + color: var(--error); + background: rgba(248, 113, 113, 0.1); + border: 1px solid rgba(248, 113, 113, 0.2); +} +.status-error .status-dot { background: var(--error); } + +.status-unknown { + color: var(--warning); + background: rgba(251, 191, 36, 0.1); + border: 1px solid rgba(251, 191, 36, 0.2); +} +.status-unknown .status-dot { background: var(--warning); } + +/* ── Monospace cells ───────────────────────────────────────── */ +.id-cell { + font-family: var(--font-mono); + font-size: 0.78rem; + color: var(--accent); + letter-spacing: 0.02em; +} + +/* ── Load more ─────────────────────────────────────────────── */ .load-more { display: block; width: 100%; - margin-top: 1rem; - padding: 0.75rem; - background: #21262d; - border: 1px solid #30363d; - color: #c9d1d9; + margin-top: 0.875rem; + padding: 0.7rem; + background: transparent; + border: 1px dashed var(--border); + color: var(--text-dim); cursor: pointer; - border-radius: 4px; + border-radius: var(--radius); + font-family: var(--font-body); + font-size: 0.8rem; + font-weight: 500; + letter-spacing: 0.05em; + text-transform: uppercase; + transition: border-color 0.15s, color 0.15s, background 0.15s; } -.load-more:hover { background: #30363d; } +.load-more:hover { + border-color: var(--accent); + color: var(--accent); + background: 
var(--accent-dim); +} +/* ── Span expand ───────────────────────────────────────────── */ .expandable { cursor: pointer; } -.expand-icon { margin-right: 0.5rem; } + +.expand-icon { + display: inline-flex; + align-items: center; + justify-content: center; + width: 16px; + height: 16px; + margin-right: 0.5rem; + color: var(--text-dim); + font-size: 0.6rem; + transition: transform 0.18s ease; +} + .span-details { - background: #161b22; - padding: 1rem; - margin: 0.5rem 0; - border-radius: 4px; - font-family: monospace; - font-size: 0.85rem; + background: #020508; + padding: 1.25rem; + font-family: var(--font-mono); + font-size: 0.78rem; white-space: pre-wrap; word-break: break-all; + color: #7a9ab5; + line-height: 1.75; + border-top: 1px solid var(--border); } +/* ── Empty state ───────────────────────────────────────────── */ .empty-state { text-align: center; - padding: 3rem; - color: #8b949e; + padding: 4rem 2rem; + color: var(--text-dim); + font-size: 0.875rem; + letter-spacing: 0.02em; +} + +/* ── Live indicator ────────────────────────────────────────── */ +.live-indicator { + display: inline-flex; + align-items: center; + gap: 0.4rem; + font-size: 0.68rem; + font-weight: 700; + letter-spacing: 0.1em; + text-transform: uppercase; + color: var(--success); +} + +.live-dot { + width: 6px; + height: 6px; + border-radius: 50%; + background: var(--success); + box-shadow: 0 0 6px rgba(52, 211, 153, 0.5); + animation: livePulse 2s ease-in-out infinite; +} + +@keyframes livePulse { + 0%, 100% { opacity: 1; box-shadow: 0 0 6px rgba(52, 211, 153, 0.5); } + 50% { opacity: 0.6; box-shadow: 0 0 2px rgba(52, 211, 153, 0.2); } +} + +/* ── VM Grid ───────────────────────────────────────────────── */ +.vm-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(360px, 1fr)); + gap: 1.25rem; +} + +/* ── VM Card ───────────────────────────────────────────────── */ +.vm-card { + background: var(--card); + border: 1px solid var(--border); + border-radius: 
var(--radius-xl); + padding: 1.25rem; + backdrop-filter: blur(10px); + -webkit-backdrop-filter: blur(10px); + transition: border-color 0.2s, transform 0.2s; + position: relative; + overflow: hidden; +} + +.vm-card::before { + content: ''; + position: absolute; + top: 0; left: 0; right: 0; + height: 1px; + background: linear-gradient(90deg, transparent, var(--accent-glow), transparent); + opacity: 0; + transition: opacity 0.2s; +} + +.vm-card:hover { + border-color: rgba(34, 211, 238, 0.18); + transform: translateY(-2px); +} + +.vm-card:hover::before { opacity: 1; } + +.vm-card-header { + display: flex; + align-items: flex-start; + justify-content: space-between; + margin-bottom: 0.875rem; +} + +.vm-card h3 { + font-family: var(--font-display); + font-size: 0.95rem; + font-weight: 700; + color: var(--text-bright); + letter-spacing: 0.03em; +} + +.vm-status { + display: inline-flex; + align-items: center; + gap: 0.35rem; + padding: 0.2rem 0.65rem; + border-radius: 999px; + font-size: 0.68rem; + font-weight: 700; + letter-spacing: 0.08em; + text-transform: uppercase; +} + +.vm-status.running { + background: rgba(52, 211, 153, 0.1); + color: var(--success); + border: 1px solid rgba(52, 211, 153, 0.2); +} + +.vm-status.stopped { + background: rgba(248, 113, 113, 0.1); + color: var(--error); + border: 1px solid rgba(248, 113, 113, 0.2); +} + +.vm-updated { + font-size: 0.7rem; + color: var(--text-dim); + margin-bottom: 0.75rem; + font-family: var(--font-mono); + letter-spacing: 0.02em; +} + +.vm-divider { + height: 1px; + background: var(--border-soft); + margin: 0.875rem 0; +} + +.vm-stats { width: 100%; } + +.vm-stats td { + padding: 0.28rem 0; + border-bottom: none; + font-size: 0.82rem; +} + +.vm-stats td:first-child { + color: var(--text-dim); + font-size: 0.7rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.07em; + width: 42%; +} + +.vm-stats td:last-child { + font-family: var(--font-mono); + font-size: 0.78rem; + color: var(--text); +} + 
+.vm-issues { + display: flex; + flex-wrap: wrap; + gap: 0.4rem; + margin-top: 0.875rem; +} + +.issue { + font-size: 0.68rem; + padding: 0.22rem 0.6rem; + border-radius: 4px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 0.06em; +} + +.issue.gateway_down { + background: rgba(248, 113, 113, 0.12); + color: var(--error); + border: 1px solid rgba(248, 113, 113, 0.2); +} + +.issue.http_unhealthy { + background: rgba(251, 191, 36, 0.1); + color: var(--warning); + border: 1px solid rgba(251, 191, 36, 0.2); +} + +.issue.backup_stale { + background: rgba(251, 191, 36, 0.08); + color: var(--warning); + border: 1px solid rgba(251, 191, 36, 0.15); +} + +.issue.version_mismatch { + background: rgba(167, 139, 250, 0.1); + color: var(--purple); + border: 1px solid rgba(167, 139, 250, 0.2); +} + +/* ── Agents Page ───────────────────────────────────────────── */ +.agents-layout { + display: grid; + grid-template-columns: minmax(0, 1fr) 280px; + gap: 1.5rem; + margin-top: 1.25rem; +} + +@media (max-width: 900px) { + .agents-layout { + grid-template-columns: 1fr; + } +} + +.vm-strip { + display: flex; + flex-wrap: wrap; + gap: 0.75rem; + margin-bottom: 1.5rem; +} + +.vm-pill { + display: inline-flex; + align-items: center; + gap: 0.5rem; + padding: 0.5rem 1rem; + background: var(--surface); + border: 1px solid var(--border); + border-radius: 999px; + font-size: 0.78rem; + font-weight: 600; + letter-spacing: 0.04em; + transition: border-color 0.2s, opacity 0.2s; +} + +.vm-pill.active { + border-color: rgba(52, 211, 153, 0.3); +} + +.vm-pill.inactive { + border-color: rgba(248, 113, 113, 0.2); + opacity: 0.6; +} + +.vm-pill-dot { + width: 7px; + height: 7px; + border-radius: 50%; + flex-shrink: 0; +} + +.vm-pill.active .vm-pill-dot { + background: var(--success); + box-shadow: 0 0 6px rgba(52, 211, 153, 0.5); + animation: livePulse 2s ease-in-out infinite; +} + +.vm-pill.inactive .vm-pill-dot { + background: var(--error); +} + +.vm-pill-name { + font-family: 
var(--font-mono); + color: var(--text-bright); +} + +.vm-pill-label { + color: var(--text-dim); + font-size: 0.68rem; + text-transform: uppercase; + letter-spacing: 0.06em; +} + +.timeline { + display: flex; + flex-direction: column; + gap: 0.5rem; + min-width: 0; +} + +.timeline-event { + background: var(--card); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + padding: 0.875rem 1.125rem; + backdrop-filter: blur(8px); + -webkit-backdrop-filter: blur(8px); + animation: fadeUp 0.25s ease both; + transition: border-color 0.15s; +} + +.timeline-event:hover { + border-color: rgba(34, 211, 238, 0.15); +} + +.timeline-event-header { + display: flex; + align-items: center; + gap: 0.6rem; + margin-bottom: 0.35rem; +} + +.timeline-vm-tag { + font-family: var(--font-mono); + font-size: 0.68rem; + font-weight: 700; + padding: 0.15rem 0.5rem; + border-radius: 4px; + letter-spacing: 0.05em; + text-transform: uppercase; +} + +.timeline-vm-tag.zap { + background: rgba(34, 211, 238, 0.12); + color: var(--accent); + border: 1px solid rgba(34, 211, 238, 0.2); +} + +.timeline-vm-tag.orb { + background: rgba(167, 139, 250, 0.12); + color: var(--purple); + border: 1px solid rgba(167, 139, 250, 0.2); +} + +.timeline-vm-tag.sun { + background: rgba(251, 191, 36, 0.12); + color: var(--warning); + border: 1px solid rgba(251, 191, 36, 0.2); +} + +.timeline-vm-tag.unknown { + background: var(--surface-2); + color: var(--text-dim); + border: 1px solid var(--border); +} + +.timeline-event-type { + font-size: 0.75rem; + font-weight: 600; + color: var(--text-bright); +} + +.timeline-event-time { + font-family: var(--font-mono); + font-size: 0.68rem; + color: var(--text-dim); + margin-left: auto; +} + +.timeline-event-body { + font-size: 0.82rem; + color: var(--text); + line-height: 1.5; + padding-left: 0.15rem; +} + +.timeline-event-body.tool-name { + font-family: var(--font-mono); + color: var(--accent); + font-size: 0.78rem; +} + +.timeline-event-body.message-preview { + 
color: var(--text-dim); + font-style: italic; +} + +.timeline-event-body.error-message { + color: var(--error); +} + +.timeline-duration { + font-family: var(--font-mono); + font-size: 0.72rem; + color: var(--text-dim); + margin-left: 0.5rem; +} + +.timeline-detail { + margin-top: 0.5rem; + padding: 0.75rem; + background: #020508; + border-radius: var(--radius); + font-family: var(--font-mono); + font-size: 0.75rem; + color: #7a9ab5; + white-space: pre-wrap; + word-break: break-all; + line-height: 1.65; + display: none; +} + +.timeline-event.expanded .timeline-detail { + display: block; +} + +.timeline-expand-hint { + display: inline-flex; + align-items: center; + margin-top: 0.3rem; + padding: 0; + background: transparent; + border: none; + color: var(--text-dim); + cursor: pointer; + font-family: var(--font-body); + font-size: 0.68rem; + letter-spacing: 0.03em; +} + +.timeline-expand-hint:hover { + color: var(--accent); +} + +.stats-panel { + display: flex; + flex-direction: column; + gap: 1rem; +} + +.stat-card { + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + padding: 1rem; +} + +.stat-card-title { + font-size: 0.68rem; + font-weight: 700; + color: var(--text-dim); + text-transform: uppercase; + letter-spacing: 0.1em; + margin-bottom: 0.6rem; +} + +.stat-card-value { + font-family: var(--font-display); + font-size: 1.6rem; + font-weight: 800; + color: var(--text-bright); + letter-spacing: -0.02em; +} + +.stat-card-sub { + font-size: 0.72rem; + color: var(--text-dim); + margin-top: 0.1rem; +} + +.stat-list { + list-style: none; +} + +.stat-list li { + display: flex; + justify-content: space-between; + align-items: center; + padding: 0.35rem 0; + border-bottom: 1px solid var(--border-soft); + font-size: 0.8rem; +} + +.stat-list li:last-child { + border-bottom: none; +} + +.stat-list-name { + font-family: var(--font-mono); + font-size: 0.75rem; + color: var(--text); +} + +.stat-list-count { + font-family: 
var(--font-mono); + font-size: 0.72rem; + color: var(--text-dim); + background: var(--surface-2); + padding: 0.1rem 0.4rem; + border-radius: 4px; +} + +.event-icon { + width: 18px; + height: 18px; + border-radius: 4px; + display: flex; + align-items: center; + justify-content: center; + font-size: 0.6rem; + flex-shrink: 0; +} + +.event-icon.message-in { + background: rgba(52, 211, 153, 0.12); + color: var(--success); + border: 1px solid rgba(52, 211, 153, 0.25); +} + +.event-icon.message-out { + background: rgba(34, 211, 238, 0.12); + color: var(--accent); + border: 1px solid rgba(34, 211, 238, 0.25); +} + +.event-icon.tool { + background: rgba(167, 139, 250, 0.12); + color: var(--purple); + border: 1px solid rgba(167, 139, 250, 0.25); +} + +.event-icon.error { + background: rgba(248, 113, 113, 0.12); + color: var(--error); + border: 1px solid rgba(248, 113, 113, 0.25); +} + +.event-icon.session { + background: rgba(251, 191, 36, 0.12); + color: var(--warning); + border: 1px solid rgba(251, 191, 36, 0.25); +} + +.event-icon.internal { + background: var(--surface-2); + color: var(--text-dim); + border: 1px solid var(--border); } diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..040ac67 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,110 @@ +services: + postgres: + image: postgres:16 + container_name: agentmon-db + environment: + POSTGRES_PASSWORD: pass + POSTGRES_DB: agentmon + ports: + - "5432:5432" + volumes: + - postgres-data:/var/lib/postgresql/data + - ./deploy/k8s/postgres.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + + nats: + image: nats:latest + container_name: agentmon-nats + ports: + - "4222:4222" + command: "--jetstream" + volumes: + - nats-data:/data + + ingest-gateway: + build: . 
+ container_name: agentmon-ingest + command: ingest-gateway + ports: + - "8080:8080" + depends_on: + nats: + condition: service_started + environment: + AGENTMON_ADDR: :8080 + NATS_URL: nats://nats:4222 + NATS_TOPIC: agentmon.events.v1 + + query-api: + build: . + container_name: agentmon-query + command: query-api + ports: + - "8081:8081" + depends_on: + nats: + condition: service_started + postgres: + condition: service_healthy + environment: + AGENTMON_QUERY_ADDR: :8081 + DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable + AGENTMON_QUERY_BASE: http://localhost:8081 + NATS_URL: nats://nats:4222 + NATS_TOPIC: agentmon.events.v1 + + web-ui: + build: . + container_name: agentmon-ui + command: web-ui + ports: + - "8082:8082" + depends_on: + query-api: + condition: service_started + environment: + AGENTMON_UI_ADDR: :8082 + AGENTMON_QUERY_BASE: http://query-api:8081 + + event-processor: + build: . + container_name: agentmon-processor + command: event-processor + depends_on: + postgres: + condition: service_healthy + nats: + condition: service_started + environment: + DATABASE_URL: postgres://postgres:pass@postgres:5432/agentmon?sslmode=disable + NATS_URL: nats://nats:4222 + NATS_TOPIC: agentmon.events.v1 + + openclaw-monitor: + build: . 
+ container_name: agentmon-openclaw-monitor + command: openclaw-monitor + network_mode: host + depends_on: + nats: + condition: service_started + environment: + NATS_URL: nats://localhost:4222 + NATS_TOPIC: agentmon.events.v1 + OPENCLAW_REGISTRY: /openclaw-registry/openclaw-instances.json + POLL_INTERVAL: 30s + volumes: + - /home/will/.claude/state/openclaw-instances.json:/openclaw-registry/openclaw-instances.json:ro + - /var/run/libvirt/libvirt-sock:/var/run/libvirt/libvirt-sock + - /home/will/.ssh/id_rsa:/root/.ssh/id_rsa:ro + - /home/will/.ssh/id_rsa.pub:/root/.ssh/id_rsa.pub:ro + - /home/will/.ssh/authorized_keys:/root/.ssh/authorized_keys:ro + - /var/lib/libvirt:/var/lib/libvirt:ro + +volumes: + postgres-data: + nats-data: diff --git a/docs/plans/2026-03-13-agent-monitoring-design.md b/docs/plans/2026-03-13-agent-monitoring-design.md new file mode 100644 index 0000000..f4a6051 --- /dev/null +++ b/docs/plans/2026-03-13-agent-monitoring-design.md @@ -0,0 +1,147 @@ +# Agent Activity Monitoring via OpenClaw Hooks + +**Date:** 2026-03-13 +**Status:** Approved + +## Goal + +Monitor all OpenClaw agent and subagent activity across the three VMs (zap, orb, sun) — tool calls, conversation flow, token usage, session lifecycle, and errors — and display it in a real-time dashboard in the agentmon web UI. 
+ +## Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ VM (zap / orb / sun) │ +│ │ +│ OpenClaw Gateway │ +│ ├── agent loop (messages, tools, sessions) │ +│ └── agentmon-hook (TypeScript) │ +│ │ listens to: message:received/sent, │ +│ │ tool_result_persist, command:*, session:* │ +│ │ │ +│ └──── POST /v1/events ─────────────────┐ │ +│ │ │ +└──────────────────────────────────────────────────│───┘ + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ Host │ +│ agentmon ingest-gateway (:8080) │ +│ → NATS → event-processor → Postgres │ +│ → query-api → web-ui (new "Agents" page) │ +└──────────────────────────────────────────────────────┘ +``` + +One hook deployed to all three VMs captures everything and ships it to the existing agentmon pipeline. No changes needed to ingest, NATS, or storage. + +## Event Mapping + +| OpenClaw Event | agentmon Event | What it captures | +|---|---|---| +| `command:new` | `session.start` | Agent session begins | +| `command:stop` / `command:reset` | `session.end` | Session ends | +| `message:received` | `run.start` | Inbound message starts a turn | +| `message:sent` | `run.end` | Agent response completes the turn | +| `tool_result_persist` | `span.start` + `span.end` | Tool call with result | +| `session:compact:before/after` | `span` (kind: `internal`) | Context window management | + +### Correlation + +- `session_id` = OpenClaw `sessionKey` +- `run_id` = generated UUID per inbound message, carried through to `message:sent` +- `framework` = `"openclaw"` +- `client_id` = VM name (zap / orb / sun) + +Token usage and cost attached via `WithLLMUsage` attributes on `run.end` events if the `message:sent` payload includes usage metadata. + +## Hook Design + +### Directory Structure + +``` +~/.openclaw/hooks/agentmon/ +├── HOOK.md # metadata: events, requirements +├── handler.ts # event capture + HTTP emit +└── package.json # minimal deps +``` + +### Deployment + +SCP the directory to each VM. 
The hook auto-discovers via OpenClaw's hook loading — no config changes needed beyond having hooks enabled. + +The hook POSTs to the host machine's ingest gateway. VMs are on the libvirt bridge (192.168.122.x), so the gateway URL is configured as an env var or uses the host's bridge IP. + +### Resilience + +- Fire-and-forget with a small in-memory buffer (batch up to 10 events or 2s, whichever comes first) +- 500ms timeout on fetch calls — if agentmon is slow, skip and move on +- Events that fail to send are logged locally but not retried +- The hook must never slow down the OpenClaw agent loop + +## Error Handling + +### In the hook + +- All HTTP POSTs wrapped in try/catch — never throw, never block +- Malformed event payloads (missing sessionKey, etc.) silently dropped with debug log + +### In the pipeline + +- Ingest gateway deduplicates by event ID — safe if a hook sends twice +- Events with `framework: "openclaw"` but missing correlation IDs get stored but won't appear in the agents timeline + +### Edge cases + +- VM reboots mid-session: no `session.end` emitted — UI shows session as "ongoing" until a new `command:new` arrives +- OpenClaw compacts context before hook fires: `session:compact:after` still fires, captured as internal span +- Network partition between VM and host: events silently lost, no backfill — acceptable for monitoring + +## UI — Agents Page + +### Layout + +A live activity dashboard at `/agents` with three sections: + +1. **Top strip**: Three VM pill indicators (zap / orb / sun) showing online/offline with a subtle pulse when active +2. **Activity timeline**: Vertical feed of events across all agents — messages, tool calls, errors — with VM name color-coded, monospace timestamps, and collapsible tool call detail rows. Real-time via existing WebSocket. +3. 
**Side stats panel**: Aggregate metrics — messages/hour, tool calls today, error rate, most-used tools + +### Aesthetic + +Matches the refined dark theme already in place: +- Timeline cards with glassmorphism +- Color-coded VM badges +- Monospace timestamps (Fira Code) +- Syne display font for headings +- Fade-in animations on new events +- Status pill badges consistent with existing design system + +## Implementation Plan + +### Phase 1: OpenClaw Hook + +1. Create hook directory structure (`HOOK.md`, `handler.ts`) +2. Implement event-to-agentmon mapper — translate each OpenClaw event type to the agentmon envelope schema +3. HTTP emitter with buffering (batch up to 10 events or 2s, whichever first) and 500ms timeout +4. Unit test the mapper logic locally + +### Phase 2: Agentmon UI — Agents Page + +5. Add `/agents` route to the SPA router in `app.js` +6. Add "Agents" nav link in header +7. Build the top strip — three VM status pills pulling from existing `openclaw.snapshot` data +8. Build the activity timeline — subscribe to WebSocket, filter for `framework: "openclaw"` events, render as vertical feed with collapsible tool call details +9. Build the side stats panel — aggregate counts from the query API (messages/hour, tool calls, error rate, top tools) +10. Style with the refined dark aesthetic — glassmorphism timeline cards, color-coded VM badges, monospace timestamps, fade-in animations + +### Phase 3: Deploy + +11. SCP hook to all three VMs, verify auto-discovery +12. 
Send a test message to one agent, confirm events flow end-to-end + +## Not in Scope (Future) + +- Token/cost dashboard (needs usage data verification in `message:sent` payloads) +- Historical analytics and aggregation queries +- Hook auto-deployment via openclaw-monitor +- Alerting on error rate spikes diff --git a/docs/plans/2026-03-13-agent-monitoring-plan.md b/docs/plans/2026-03-13-agent-monitoring-plan.md new file mode 100644 index 0000000..ace206b --- /dev/null +++ b/docs/plans/2026-03-13-agent-monitoring-plan.md @@ -0,0 +1,1234 @@ +# Agent Activity Monitoring Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Monitor all OpenClaw agent activity (tool calls, messages, sessions, errors) across three VMs and display it in a real-time dashboard. + +**Architecture:** An OpenClaw hook (TypeScript) on each VM captures agent events and POSTs them as agentmon envelopes to the ingest gateway. A new `/agents` page in the web UI renders a live activity timeline via WebSocket. A small backend addition filters events by framework. + +**Tech Stack:** TypeScript (hook), Go (backend filter), Vanilla JS/CSS (UI) + +**Design doc:** `docs/plans/2026-03-13-agent-monitoring-design.md` + +--- + +## Task 1: Add framework filter to events query + +**Files:** +- Modify: `internal/store/postgres/query.go` +- Modify: `cmd/query-api/main.go` + +The agents UI needs to load recent events filtered by `source_framework = 'openclaw'`. The existing `ListRecentEvents` only takes `limit`. 
+ +**Step 1: Add EventsFilter struct and update ListRecentEvents** + +In `internal/store/postgres/query.go`, replace the current function: + +```go +// EventsFilter narrows the rows returned by ListRecentEvents. +// An empty EventType or Framework is not applied as a condition. +type EventsFilter struct { + Limit int + EventType string + Framework string +} + +// ListRecentEvents returns events ordered newest first, optionally restricted +// to a single event type and/or source framework. A non-positive Limit falls +// back to 100 and a Limit above 1000 is capped at 1000. +func (d *DB) ListRecentEvents(ctx context.Context, f EventsFilter) ([]EventRow, error) { + if f.Limit <= 0 { + f.Limit = 100 + } + if f.Limit > 1000 { + f.Limit = 1000 + } + + // Filter values are always bound as placeholders; only the placeholder + // index is formatted into the SQL text. + query := "SELECT event_id, ts, type, payload FROM events WHERE 1=1" + args := []any{} + argN := 1 + + if f.EventType != "" { + query += fmt.Sprintf(" AND type = $%d", argN) + args = append(args, f.EventType) + argN++ + } + if f.Framework != "" { + query += fmt.Sprintf(" AND source_framework = $%d", argN) + args = append(args, f.Framework) + argN++ + } + + query += fmt.Sprintf(" ORDER BY ts DESC LIMIT $%d", argN) + args = append(args, f.Limit) + + rows, err := d.sql.QueryContext(ctx, query, args...) + if err != nil { + return nil, fmt.Errorf("list recent events: query: %w", err) + } + defer rows.Close() + + var out []EventRow + for rows.Next() { + var r EventRow + if err := rows.Scan(&r.EventID, &r.TS, &r.Type, &r.Payload); err != nil { + return nil, fmt.Errorf("list recent events: scan: %w", err) + } + out = append(out, r) + } + // Surface iteration errors without handing back a partially filled slice. + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("list recent events: iterate: %w", err) + } + return out, nil +} +``` + +Add `"fmt"` to the import block. 
+ +**Step 2: Update the query-api handler to pass filters** + +In `cmd/query-api/main.go`, update the `/v1/events` handler (around line 113): + +```go +r.Get("/v1/events", func(w http.ResponseWriter, r *http.Request) { + // The Atoi error is deliberately ignored: a missing or malformed limit parses + // as 0, which ListRecentEvents replaces with its default of 100. + limit, _ := strconv.Atoi(r.URL.Query().Get("limit")) + f := postgres.EventsFilter{ + Limit: limit, + EventType: r.URL.Query().Get("event_type"), + Framework: r.URL.Query().Get("framework"), + } + events, err := db.ListRecentEvents(r.Context(), f) + if err != nil { + httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"}) + return + } + httpx.WriteJSON(w, http.StatusOK, map[string]any{"events": events}) +}) +``` + +**Step 3: Verify it compiles** + +Run: `cd /home/will/lab/agentmon && go build ./...` +Expected: No errors + +**Step 4: Test the filter with curl** + +Run: `curl -s 'http://localhost:8081/v1/events?framework=openclaw&limit=5'` +Expected: `{"events":null}` or `{"events":[]}` (no openclaw events yet, but no errors) + +**Step 5: Commit** + +```bash +git add internal/store/postgres/query.go cmd/query-api/main.go +git commit -m "feat: add event_type and framework filters to events endpoint" +``` + +--- + +## Task 2: Add /agents SPA route to Go server + +**Files:** +- Modify: `cmd/web-ui/main.go:51` + +**Step 1: Add /agents to the SPA catch-all** + +In `cmd/web-ui/main.go`, update line 51 to include `/agents`: + +Change: +```go +if r.URL.Path == "/" || strings.HasPrefix(r.URL.Path, "/sessions") || strings.HasPrefix(r.URL.Path, "/runs") || strings.HasPrefix(r.URL.Path, "/openclaw") { +``` + +To: +```go +if r.URL.Path == "/" || strings.HasPrefix(r.URL.Path, "/sessions") || strings.HasPrefix(r.URL.Path, "/runs") || strings.HasPrefix(r.URL.Path, "/openclaw") || strings.HasPrefix(r.URL.Path, "/agents") { +``` + +**Step 2: Verify it compiles** + +Run: `cd /home/will/lab/agentmon && go build ./cmd/web-ui/` +Expected: No errors + +**Step 3: Commit** + +```bash +git add cmd/web-ui/main.go +git commit -m "feat: add /agents 
SPA route to web-ui server" +``` + +--- + +## Task 3: Add Agents nav link and route in app.js + +**Files:** +- Modify: `cmd/web-ui/static/index.html` +- Modify: `cmd/web-ui/static/app.js` + +**Step 1: Add "Agents" nav link in index.html** + +In `cmd/web-ui/static/index.html`, update the `