feat(query-api): add richer stats and retention

This commit is contained in:
William Valentin
2026-03-26 11:22:34 -07:00
parent fdfcb50e80
commit 43877a5448
10 changed files with 583 additions and 85 deletions
+97 -18
View File
@@ -1,7 +1,9 @@
package main
import (
"context"
"database/sql"
"encoding/json"
"log"
"net/http"
"os"
@@ -18,11 +20,16 @@ import (
"github.com/nats-io/nats.go"
)
type wsClient struct {
conn *websocket.Conn
send chan []byte
}
var (
wsUpgrader = websocket.Upgrader{
CheckOrigin: func(r *http.Request) bool { return true },
}
wsClients = make(map[*websocket.Conn]bool)
wsClients = make(map[*wsClient]bool)
wsMu sync.RWMutex
natsConn *nats.Conn
)
@@ -31,23 +38,15 @@ func subscribeToNATS(nc *nats.Conn) {
topic := envDefault("NATS_TOPIC", "agentmon.events.v1")
sub, err := nc.Subscribe(topic, func(msg *nats.Msg) {
wsMu.RLock()
var stale []*websocket.Conn
for conn := range wsClients {
err := conn.WriteMessage(websocket.TextMessage, msg.Data)
if err != nil {
conn.Close()
stale = append(stale, conn)
for client := range wsClients {
select {
case client.send <- msg.Data:
default:
// Slow client; close and remove in background.
go removeClient(client)
}
}
wsMu.RUnlock()
if len(stale) > 0 {
wsMu.Lock()
for _, conn := range stale {
delete(wsClients, conn)
}
wsMu.Unlock()
}
})
if err != nil {
log.Printf("failed to subscribe to NATS: %v", err)
@@ -57,19 +56,44 @@ func subscribeToNATS(nc *nats.Conn) {
_ = sub
}
// removeClient unregisters c from wsClients and then closes both its send
// channel and its connection. The membership check makes a call for a client
// that is no longer registered a no-op, so this path never closes the same
// send channel twice. All of it happens while holding the wsMu write lock.
func removeClient(c *wsClient) {
	wsMu.Lock()
	defer wsMu.Unlock()

	if registered := wsClients[c]; !registered {
		return
	}
	delete(wsClients, c)
	close(c.send)
	c.conn.Close()
}
func wsHandler(w http.ResponseWriter, r *http.Request) {
conn, err := wsUpgrader.Upgrade(w, r, nil)
if err != nil {
return
}
defer conn.Close()
client := &wsClient{
conn: conn,
send: make(chan []byte, 256),
}
wsMu.Lock()
wsClients[conn] = true
wsClients[client] = true
wsMu.Unlock()
log.Printf("WebSocket client connected")
// Writer goroutine: sole owner of conn writes.
go func() {
defer conn.Close()
for msg := range client.send {
if err := conn.WriteMessage(websocket.TextMessage, msg); err != nil {
break
}
}
}()
// Read loop blocks until the client disconnects.
for {
_, _, err := conn.ReadMessage()
if err != nil {
@@ -78,8 +102,12 @@ func wsHandler(w http.ResponseWriter, r *http.Request) {
}
wsMu.Lock()
delete(wsClients, conn)
if wsClients[client] {
delete(wsClients, client)
close(client.send)
}
wsMu.Unlock()
log.Printf("WebSocket client disconnected")
}
@@ -249,6 +277,36 @@ func main() {
httpx.WriteJSON(w, http.StatusOK, map[string]any{"tools": tools})
})
r.Get("/v1/stats/top-models", func(w http.ResponseWriter, r *http.Request) {
limit, _ := strconv.Atoi(r.URL.Query().Get("limit"))
models, err := db.GetTopModels(r.Context(), limit)
if err != nil {
httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"})
return
}
if models == nil {
models = []postgres.TopModel{}
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"models": models})
})
r.Post("/v1/admin/retention", func(w http.ResponseWriter, r *http.Request) {
var req struct {
Days int `json:"days"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Days <= 0 {
httpx.WriteJSON(w, http.StatusBadRequest, map[string]any{"error": "invalid_request", "message": "days must be a positive integer"})
return
}
cutoff := time.Now().AddDate(0, 0, -req.Days)
deleted, err := db.DeleteOlderThan(r.Context(), cutoff)
if err != nil {
httpx.WriteJSON(w, http.StatusInternalServerError, map[string]any{"error": "db_error"})
return
}
httpx.WriteJSON(w, http.StatusOK, map[string]any{"deleted": deleted, "cutoff": cutoff.Format(time.RFC3339)})
})
r.Get("/v1/stats/timeseries", func(w http.ResponseWriter, r *http.Request) {
window := r.URL.Query().Get("window")
switch window {
@@ -267,6 +325,27 @@ func main() {
httpx.WriteJSON(w, http.StatusOK, timeseries)
})
// Background retention cleanup
retentionDays := 30
if v := os.Getenv("RETENTION_DAYS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
retentionDays = n
}
}
go func() {
ticker := time.NewTicker(24 * time.Hour)
defer ticker.Stop()
for range ticker.C {
cutoff := time.Now().AddDate(0, 0, -retentionDays)
deleted, err := db.DeleteOlderThan(context.Background(), cutoff)
if err != nil {
log.Printf("retention cleanup error: %v", err)
} else if deleted > 0 {
log.Printf("retention cleanup: deleted %d events older than %s", deleted, cutoff.Format(time.RFC3339))
}
}
}()
log.Printf("query-api listening on %s", addr)
log.Fatal(http.ListenAndServe(addr, r))
}
+2
View File
@@ -18,3 +18,5 @@ create index if not exists events_ts_idx on events (ts);
create index if not exists events_session_idx on events (session_id);
create index if not exists events_run_idx on events (run_id);
create index if not exists events_type_ts_idx on events (type, ts);
create index if not exists events_framework_client_ts_idx on events (source_framework, client_id, ts);
create index if not exists events_framework_ts_idx on events (source_framework, ts);
+4 -2
View File
@@ -14,7 +14,9 @@ var validTypes = map[string]bool{
"span.end": true,
"error": true,
"metric.snapshot": true,
"openclaw.snapshot": true,
"openclaw.snapshot": true,
"swarm.snapshot": true,
"swarm.service.snapshot": true,
}
type ValidationError struct {
@@ -62,7 +64,7 @@ func Validate(m map[string]any) error {
}
// Source is optional for openclaw.snapshot, swarm.snapshot, and swarm.service.snapshot events
if eventType != "openclaw.snapshot" {
if eventType != "openclaw.snapshot" && eventType != "swarm.snapshot" && eventType != "swarm.service.snapshot" {
source, ok := event["source"].(map[string]any)
if !ok {
return ValidationError{Field: "event.source", Message: "missing or invalid"}
+51
View File
@@ -54,4 +54,55 @@ on conflict (event_id) do nothing
return err
}
// InsertEventBatch stores events using one prepared INSERT executed inside a
// single transaction. An empty slice is a no-op and a single event is handed
// to InsertEvent. Rows whose event_id already exists are skipped by the
// ON CONFLICT clause. The first marshal or exec error is returned as-is and
// the transaction is not committed.
func (d *DB) InsertEventBatch(ctx context.Context, events []InsertEvent) error {
	switch len(events) {
	case 0:
		return nil
	case 1:
		return d.InsertEvent(ctx, events[0])
	}

	const insertSQL = `
INSERT INTO events (
event_id, ts, type, session_id, run_id, trace_id, span_id, parent_span_id,
source_framework, client_id, payload
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11)
ON CONFLICT (event_id) DO NOTHING
`

	tx, err := d.sql.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	// Runs on every return path; its result is deliberately ignored.
	defer tx.Rollback()

	stmt, err := tx.PrepareContext(ctx, insertSQL)
	if err != nil {
		return err
	}
	defer stmt.Close()

	for i := range events {
		ev := &events[i]

		// The payload is stored as its JSON encoding.
		encoded, err := json.Marshal(ev.Payload)
		if err != nil {
			return err
		}
		if _, err := stmt.ExecContext(ctx,
			ev.EventID, ev.TS, ev.Type, ev.SessionID, ev.RunID,
			ev.TraceID, ev.SpanID, ev.ParentSpanID, ev.SourceFramework, ev.ClientID,
			encoded,
		); err != nil {
			return err
		}
	}

	return tx.Commit()
}
// DeleteOlderThan issues one DELETE against events for every row whose ts is
// strictly before cutoff. It returns the affected-row count reported for that
// statement, or 0 and the error if the statement itself fails.
func (d *DB) DeleteOlderThan(ctx context.Context, cutoff time.Time) (int64, error) {
	const stmt = `DELETE FROM events WHERE ts < $1`

	res, execErr := d.sql.ExecContext(ctx, stmt, cutoff)
	if execErr != nil {
		return 0, execErr
	}
	return res.RowsAffected()
}
var ErrMissingField = errors.New("missing required field")
+5
View File
@@ -3,6 +3,8 @@ package postgres
import (
"context"
"database/sql"
"time"
_ "github.com/jackc/pgx/v5/stdlib"
)
@@ -15,6 +17,9 @@ func Open(url string) (*DB, error) {
if err != nil {
return nil, err
}
db.SetMaxOpenConns(25)
db.SetMaxIdleConns(5)
db.SetConnMaxLifetime(5 * time.Minute)
return &DB{sql: db}, nil
}
+11
View File
@@ -19,6 +19,7 @@ type EventsFilter struct {
EventType string
Framework string
ClientID string
Since *time.Time // if nil, defaults to 24h ago
}
func (d *DB) ListRecentEvents(ctx context.Context, f EventsFilter) ([]EventRow, error) {
@@ -29,10 +30,20 @@ func (d *DB) ListRecentEvents(ctx context.Context, f EventsFilter) ([]EventRow,
f.Limit = 1000
}
since := f.Since
if since == nil {
t := time.Now().Add(-24 * time.Hour)
since = &t
}
query := "SELECT event_id, ts, type, payload FROM events WHERE 1=1"
args := []any{}
argN := 1
query += fmt.Sprintf(" AND ts >= $%d", argN)
args = append(args, *since)
argN++
if f.EventType != "" {
query += fmt.Sprintf(" AND type = $%d", argN)
args = append(args, f.EventType)
+142 -28
View File
@@ -32,7 +32,7 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
SELECT
session_id,
MIN(ts) as started_at,
MAX(ts) as ended_at,
MAX(CASE WHEN type = 'session.end' THEN ts END) as ended_at,
MAX(source_framework) as framework,
MAX(payload->'event'->'source'->>'host') as host
FROM events
@@ -57,14 +57,18 @@ func (d *DB) GetSessionWithRuns(ctx context.Context, sessionID string) (*Session
run_id,
session_id,
MIN(ts) as started_at,
MAX(ts) as ended_at,
MAX(CASE WHEN type = 'run.end' THEN ts END) as ended_at,
CASE
WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error'
ELSE 'success'
END as status,
COUNT(DISTINCT span_id) as span_count,
COUNT(DISTINCT CASE WHEN payload->'attributes'->>'span_kind' = 'tool' THEN span_id END) as tool_count,
COALESCE(MAX(CASE WHEN type = 'run.end' THEN payload->'payload'->>'model' END), '') as model
COALESCE(
MAX(CASE WHEN type = 'run.end' THEN payload->'payload'->>'model' END),
MAX(CASE WHEN type = 'metric.snapshot' THEN payload->'payload'->'metrics'->>'model' END),
''
) as model
FROM events
WHERE session_id = $1 AND run_id IS NOT NULL
GROUP BY run_id, session_id
@@ -123,7 +127,7 @@ func (d *DB) GetRunWithSpans(ctx context.Context, runID string) (*RunDetail, []S
run_id,
session_id,
MIN(ts) as started_at,
MAX(ts) as ended_at,
MAX(CASE WHEN type = 'run.end' THEN ts END) as ended_at,
CASE
WHEN bool_or(type = 'error' OR payload->'payload'->>'status' = 'error') THEN 'error'
ELSE 'success'
@@ -148,6 +152,80 @@ func (d *DB) GetRunWithSpans(ctx context.Context, runID string) (*RunDetail, []S
return &run, spans, nil
}
// mergeSpanEvent folds a later event s for the same span into existing.
//
//   - Name is filled in only while existing has none.
//   - Kind is taken from s while existing's kind is empty or "unknown".
//   - Duration is replaced whenever s carries one.
//   - Status becomes "error" once any event reports an error and is never
//     reset by a later non-error event.
//   - Payload is combined through mergeEnvelopeJSON rather than overwritten.
func mergeSpanEvent(existing *SpanRow, s SpanRow) {
	existing.Payload = mergeEnvelopeJSON(existing.Payload, s.Payload)

	if existing.Name == "" {
		existing.Name = s.Name
	}

	switch existing.Kind {
	case "", "unknown":
		existing.Kind = s.Kind
	}

	if s.Duration != nil {
		existing.Duration = s.Duration
	}

	if s.Status == "error" {
		existing.Status = s.Status
	}
}
// mergeEnvelopeJSON merges the JSON object in next into the JSON object in
// existing and returns the re-encoded result.
//
// Fallbacks, in order:
//   - existing is empty: next is returned unchanged.
//   - next is empty: existing is returned unchanged.
//   - existing does not decode to a JSON object (invalid JSON, a non-object
//     value, or the literal null): next is returned.
//   - next does not decode into a map: existing is returned.
//   - re-encoding the merged map fails: next is returned.
//
// Nested objects are merged key by key; for any other value the one from next
// wins (see mergeJSONObjects).
func mergeEnvelopeJSON(existing, next json.RawMessage) json.RawMessage {
	if len(existing) == 0 {
		return next
	}
	if len(next) == 0 {
		return existing
	}

	var dst map[string]any
	// Unmarshal reports no error for the literal null but leaves dst nil.
	// Writing into a nil map panics, so treat that case like an undecodable
	// document.
	if err := json.Unmarshal(existing, &dst); err != nil || dst == nil {
		return next
	}

	var src map[string]any
	if err := json.Unmarshal(next, &src); err != nil {
		return existing
	}

	mergeJSONObjects(dst, src)

	merged, err := json.Marshal(dst)
	if err != nil {
		return next
	}
	return merged
}

// mergeJSONObjects copies every key of src into dst in place. When both sides
// hold an object under the same key the two objects are merged recursively;
// in every other case the value from src replaces whatever dst had.
//
// dst must be a non-nil map whenever src has at least one key.
func mergeJSONObjects(dst, src map[string]any) {
	for key, value := range src {
		srcMap, srcOK := value.(map[string]any)
		if !srcOK {
			dst[key] = value
			continue
		}

		dstMap, dstOK := dst[key].(map[string]any)
		if !dstOK {
			dst[key] = srcMap
			continue
		}

		mergeJSONObjects(dstMap, srcMap)
	}
}
// findRunIndexForSpan returns the index of the run whose time window contains
// spanStartedAt, or -1 when none does. Runs are examined from the last element
// to the first, so the highest-index match wins. A run matches when the span
// did not start before the run's StartedAt and the run either has no EndedAt
// or the span did not start after it (both bounds inclusive).
func findRunIndexForSpan(runs []RunRow, spanStartedAt time.Time) int {
	for idx := len(runs) - 1; idx >= 0; idx-- {
		candidate := &runs[idx]

		startedBeforeRun := spanStartedAt.Before(candidate.StartedAt)
		if startedBeforeRun {
			continue
		}

		stillOpen := candidate.EndedAt == nil
		if stillOpen || !spanStartedAt.After(*candidate.EndedAt) {
			return idx
		}
	}
	return -1
}
func (d *DB) listSpansForRun(ctx context.Context, runID string) ([]SpanRow, error) {
rows, err := d.sql.QueryContext(ctx, `
SELECT
@@ -187,19 +265,7 @@ func (d *DB) listSpansForRun(ctx context.Context, runID string) ([]SpanRow, erro
continue
}
if existing.Name == "" && s.Name != "" {
existing.Name = s.Name
}
if existing.Kind == "" || existing.Kind == "unknown" {
existing.Kind = s.Kind
}
if s.Duration != nil {
existing.Duration = s.Duration
}
if s.Status == "error" {
existing.Status = "error"
}
existing.Payload = s.Payload
mergeSpanEvent(existing, s)
}
if err := rows.Err(); err != nil {
@@ -214,35 +280,83 @@ func (d *DB) listSpansForRun(ctx context.Context, runID string) ([]SpanRow, erro
}
func (d *DB) attachSpansToRuns(ctx context.Context, sessionID string, runs []RunRow) ([]RunRow, error) {
if len(runs) == 0 {
return runs, nil
}
rows, err := d.sql.QueryContext(ctx, `
SELECT DISTINCT run_id
SELECT
run_id,
span_id,
COALESCE(payload->'attributes'->>'name', payload->'event'->>'type', type) as name,
COALESCE(payload->'attributes'->>'span_kind', 'unknown') as kind,
ts as started_at,
(payload->'payload'->>'duration_ms')::bigint as duration_ms,
CASE WHEN type = 'error' OR payload->'payload'->>'status' = 'error' THEN 'error' ELSE 'success' END as status,
payload
FROM events
WHERE session_id = $1 AND run_id IS NOT NULL
ORDER BY run_id
WHERE session_id = $1 AND span_id IS NOT NULL
ORDER BY ts ASC
`, sessionID)
if err != nil {
return nil, err
}
defer rows.Close()
spansByRun := make(map[string][]SpanRow)
// Map of run_id -> (map of span_id -> *SpanRow) for merging
type runSpans struct {
byID map[string]*SpanRow
order []string
}
spansByRun := make(map[string]*runSpans)
for rows.Next() {
var runID string
if err := rows.Scan(&runID); err != nil {
var s SpanRow
var runID *string
if err := rows.Scan(&runID, &s.SpanID, &s.Name, &s.Kind, &s.StartedAt, &s.Duration, &s.Status, &s.Payload); err != nil {
return nil, err
}
spans, err := d.listSpansForRun(ctx, runID)
if err != nil {
return nil, err
if runID != nil {
s.RunID = *runID
}
spansByRun[runID] = spans
if s.RunID == "" {
runIndex := findRunIndexForSpan(runs, s.StartedAt)
if runIndex == -1 {
continue
}
s.RunID = runs[runIndex].RunID
}
rs := spansByRun[s.RunID]
if rs == nil {
rs = &runSpans{byID: make(map[string]*SpanRow)}
spansByRun[s.RunID] = rs
}
existing := rs.byID[s.SpanID]
if existing == nil {
copy := s
rs.byID[s.SpanID] = &copy
rs.order = append(rs.order, s.SpanID)
continue
}
mergeSpanEvent(existing, s)
}
if err := rows.Err(); err != nil {
return nil, err
}
for i := range runs {
runs[i].Spans = spansByRun[runs[i].RunID]
rs := spansByRun[runs[i].RunID]
if rs == nil {
continue
}
spans := make([]SpanRow, 0, len(rs.order))
for _, spanID := range rs.order {
spans = append(spans, *rs.byID[spanID])
}
runs[i].Spans = spans
}
return runs, nil
}
+130
View File
@@ -0,0 +1,130 @@
package postgres
import (
"encoding/json"
"testing"
"time"
)
// TestMergeEnvelopeJSON_MergesNestedPayloads merges a span.end envelope into a
// span.start envelope and checks that start-only fields survive, end-only
// fields are added, and the later event's metadata replaces the earlier one's.
func TestMergeEnvelopeJSON_MergesNestedPayloads(t *testing.T) {
	startEnvelope := json.RawMessage(`{
"event":{"type":"span.start"},
"attributes":{"name":"tool.call","span_kind":"tool"},
"payload":{"input":{"query":"status"},"prompt_preview":"summarize"}
}`)
	endEnvelope := json.RawMessage(`{
"event":{"type":"span.end"},
"payload":{"result_preview":"ok","duration_ms":42}
}`)

	var decoded map[string]any
	if err := json.Unmarshal(mergeEnvelopeJSON(startEnvelope, endEnvelope), &decoded); err != nil {
		t.Fatalf("unmarshal merged payload: %v", err)
	}

	// Payload: fields from both events must be present.
	body, isObject := decoded["payload"].(map[string]any)
	if !isObject {
		t.Fatal("expected merged payload object")
	}
	if _, hasInput := body["input"].(map[string]any); !hasInput {
		t.Fatal("expected input from first event to be preserved")
	}
	if preview := body["result_preview"]; preview != "ok" {
		t.Fatalf("expected result_preview to be merged, got %#v", preview)
	}
	if duration := body["duration_ms"]; duration != float64(42) {
		t.Fatalf("expected duration_ms to be merged, got %#v", duration)
	}

	// Attributes exist only on the first event and must be kept.
	attributes, isObject := decoded["attributes"].(map[string]any)
	if !isObject || attributes["name"] != "tool.call" {
		t.Fatalf("expected attributes to be preserved, got %#v", decoded["attributes"])
	}

	// Event metadata exists on both; the later one wins.
	eventMeta, isObject := decoded["event"].(map[string]any)
	if !isObject || eventMeta["type"] != "span.end" {
		t.Fatalf("expected later event metadata to win, got %#v", decoded["event"])
	}
}
// TestMergeSpanEvent_PreservesStartPayloadDetails merges an end event that
// only carries result fields into a span that already holds its input, and
// checks that the input is kept alongside the new result.
func TestMergeSpanEvent_PreservesStartPayloadDetails(t *testing.T) {
	span := SpanRow{
		Name:   "tool.call",
		Kind:   "tool",
		Status: "success",
		Payload: json.RawMessage(`{
"attributes":{"name":"tool.call","span_kind":"tool"},
"payload":{"input":{"command":"ls"}}
}`),
	}
	endEvent := SpanRow{
		Status: "success",
		Payload: json.RawMessage(`{
"payload":{"result_preview":"done","duration_ms":12}
}`),
	}

	mergeSpanEvent(&span, endEvent)

	var decoded map[string]any
	if err := json.Unmarshal(span.Payload, &decoded); err != nil {
		t.Fatalf("unmarshal merged span payload: %v", err)
	}

	body := decoded["payload"].(map[string]any)
	if _, hasInput := body["input"].(map[string]any); !hasInput {
		t.Fatal("expected input to remain after merge")
	}
	if preview := body["result_preview"]; preview != "done" {
		t.Fatalf("expected result_preview after merge, got %#v", preview)
	}
}
// TestFindRunIndexForSpan_MatchesContainingRunWindow uses two closed,
// non-overlapping runs and a span that starts inside the second run's window;
// the second run's index is expected.
func TestFindRunIndexForSpan_MatchesContainingRunWindow(t *testing.T) {
	base := time.Date(2026, 3, 23, 10, 0, 0, 0, time.UTC)
	at := func(minutes int) time.Time {
		return base.Add(time.Duration(minutes) * time.Minute)
	}

	firstEnd, secondEnd := at(2), at(7)
	runs := []RunRow{
		{RunID: "run-1", StartedAt: base, EndedAt: &firstEnd},
		{RunID: "run-2", StartedAt: at(3), EndedAt: &secondEnd},
	}

	if got := findRunIndexForSpan(runs, at(4)); got != 1 {
		t.Fatalf("expected span to attach to run-2, got index %d", got)
	}
}
// TestFindRunIndexForSpan_MatchesOpenRun uses a closed first run and a second
// run without an EndedAt; a span starting after the second run began is
// expected to resolve to that open run.
func TestFindRunIndexForSpan_MatchesOpenRun(t *testing.T) {
	base := time.Date(2026, 3, 23, 10, 0, 0, 0, time.UTC)
	at := func(minutes int) time.Time {
		return base.Add(time.Duration(minutes) * time.Minute)
	}

	firstEnd := at(2)
	runs := []RunRow{
		{RunID: "run-1", StartedAt: base, EndedAt: &firstEnd},
		{RunID: "run-2", StartedAt: at(3)},
	}

	if got := findRunIndexForSpan(runs, at(5)); got != 1 {
		t.Fatalf("expected span to attach to open run-2, got index %d", got)
	}
}
+40 -20
View File
@@ -12,6 +12,7 @@ type SessionRow struct {
StartedAt time.Time `json:"started_at"`
EndedAt *time.Time `json:"ended_at,omitempty"`
Framework string `json:"framework"`
ClientID string `json:"client_id,omitempty"`
Host string `json:"host"`
RunCount int `json:"run_count"`
}
@@ -33,8 +34,10 @@ func (d *DB) ListSessions(ctx context.Context, f SessionsFilter) ([]SessionRow,
f.Limit = 200
}
// Build query dynamically
var conditions []string
// Build query dynamically using a CTE so cursor compares against
// the grouped started_at rather than individual event timestamps.
var innerConditions []string
var outerConditions []string
var args []any
argN := 1
@@ -43,50 +46,63 @@ func (d *DB) ListSessions(ctx context.Context, f SessionsFilter) ([]SessionRow,
t := time.Now().Add(-24 * time.Hour)
f.From = &t
}
conditions = append(conditions, fmt.Sprintf("ts >= $%d", argN))
innerConditions = append(innerConditions, fmt.Sprintf("ts >= $%d", argN))
args = append(args, *f.From)
argN++
if f.To != nil {
conditions = append(conditions, fmt.Sprintf("ts <= $%d", argN))
innerConditions = append(innerConditions, fmt.Sprintf("ts <= $%d", argN))
args = append(args, *f.To)
argN++
}
if f.Framework != "" {
conditions = append(conditions, fmt.Sprintf("source_framework = $%d", argN))
innerConditions = append(innerConditions, fmt.Sprintf("source_framework = $%d", argN))
args = append(args, f.Framework)
argN++
}
// Host filter applies to an aggregate, so it goes in the outer WHERE
if f.Host != "" {
conditions = append(conditions, fmt.Sprintf("payload->'event'->'source'->>'host' = $%d", argN))
outerConditions = append(outerConditions, fmt.Sprintf("host = $%d", argN))
args = append(args, f.Host)
argN++
}
// Cursor compares against grouped started_at, so it goes in the outer WHERE
if f.Cursor != nil {
conditions = append(conditions, fmt.Sprintf("ts < $%d", argN))
outerConditions = append(outerConditions, fmt.Sprintf("started_at < $%d", argN))
args = append(args, *f.Cursor)
argN++
}
where := strings.Join(conditions, " AND ")
innerWhere := strings.Join(innerConditions, " AND ")
outerWhere := ""
if len(outerConditions) > 0 {
outerWhere = "WHERE " + strings.Join(outerConditions, " AND ")
}
query := fmt.Sprintf(`
SELECT
session_id,
MIN(ts) as started_at,
MAX(ts) as ended_at,
MAX(source_framework) as framework,
MAX(payload->'event'->'source'->>'host') as host,
COUNT(DISTINCT run_id) as run_count
FROM events
WHERE session_id IS NOT NULL AND %s
GROUP BY session_id
WITH session_groups AS (
SELECT
session_id,
MIN(ts) as started_at,
MAX(CASE WHEN type = 'session.end' THEN ts END) as ended_at,
MAX(source_framework) as framework,
MAX(client_id) as client_id,
MAX(payload->'event'->'source'->>'host') as host,
COUNT(DISTINCT run_id) as run_count
FROM events
WHERE session_id IS NOT NULL AND %s
GROUP BY session_id
)
SELECT session_id, started_at, ended_at, framework, client_id, host, run_count
FROM session_groups
%s
ORDER BY started_at DESC
LIMIT $%d
`, where, argN)
`, innerWhere, outerWhere, argN)
args = append(args, f.Limit+1) // fetch one extra to detect next page
rows, err := d.sql.QueryContext(ctx, query, args...)
@@ -98,10 +114,14 @@ func (d *DB) ListSessions(ctx context.Context, f SessionsFilter) ([]SessionRow,
var out []SessionRow
for rows.Next() {
var r SessionRow
var clientID *string
var host *string
if err := rows.Scan(&r.SessionID, &r.StartedAt, &r.EndedAt, &r.Framework, &host, &r.RunCount); err != nil {
if err := rows.Scan(&r.SessionID, &r.StartedAt, &r.EndedAt, &r.Framework, &clientID, &host, &r.RunCount); err != nil {
return nil, nil, err
}
if clientID != nil {
r.ClientID = *clientID
}
if host != nil {
r.Host = *host
}
+101 -17
View File
@@ -16,14 +16,22 @@ type Summary struct {
RunsToday int `json:"runs_today"`
ToolCallsToday int `json:"tool_calls_today"`
ErrorsToday int `json:"errors_today"`
TokensToday int64 `json:"tokens_today"`
CostToday float64 `json:"cost_today"`
AvgDurationMS float64 `json:"avg_duration_ms"`
ByFramework map[string]FrameworkStats `json:"by_framework"`
}
type TimeseriesBucket struct {
TS time.Time `json:"ts"`
Runs int `json:"runs"`
Tools int `json:"tools"`
Errors int `json:"errors"`
TS time.Time `json:"ts"`
Runs int `json:"runs"`
Tools int `json:"tools"`
Errors int `json:"errors"`
Tokens int64 `json:"tokens"`
InputTokens int64 `json:"input_tokens"`
OutputTokens int64 `json:"output_tokens"`
Cost float64 `json:"cost"`
AvgDurationMS float64 `json:"avg_duration_ms"`
}
type TimeseriesResult struct {
@@ -36,21 +44,23 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
now := time.Now()
midnight := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location())
// Active sessions: sessions with a session.start but no session.end (ever)
// Active sessions: sessions with a session.start but no session.end (last 7 days)
activeQ := `
SELECT COUNT(DISTINCT session_id)
FROM events
WHERE type = 'session.start'
AND session_id IS NOT NULL
AND session_id NOT IN (
SELECT DISTINCT session_id
FROM events
WHERE type = 'session.end'
AND session_id IS NOT NULL
SELECT COUNT(DISTINCT e.session_id)
FROM events e
WHERE e.type = 'session.start'
AND e.session_id IS NOT NULL
AND e.ts >= $1
AND NOT EXISTS (
SELECT 1
FROM events e2
WHERE e2.type = 'session.end'
AND e2.session_id = e.session_id
)
`
var activeSessions int
if err := d.sql.QueryRowContext(ctx, activeQ).Scan(&activeSessions); err != nil {
activeSessionsSince := time.Now().Add(-7 * 24 * time.Hour)
if err := d.sql.QueryRowContext(ctx, activeQ, activeSessionsSince).Scan(&activeSessions); err != nil {
return nil, err
}
@@ -64,6 +74,7 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
COUNT(*) FILTER (WHERE type = 'error') AS errors
FROM events
WHERE ts >= $1
AND type IN ('run.start', 'span.end', 'error')
GROUP BY source_framework
`
rows, err := d.sql.QueryContext(ctx, fwQ, midnight)
@@ -90,11 +101,30 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
return nil, err
}
// Usage stats for today (tokens, cost, avg latency)
usageQ := `
SELECT
COALESCE(SUM((payload->'payload'->'usage'->>'total_tokens')::bigint), 0),
COALESCE(SUM((payload->'payload'->'usage'->>'total_cost')::float8), 0),
COALESCE(AVG((payload->'payload'->>'duration_ms')::float8), 0)
FROM events
WHERE type = 'run.end'
AND ts >= $1
`
var tokensToday int64
var costToday, avgDurationMS float64
if err := d.sql.QueryRowContext(ctx, usageQ, midnight).Scan(&tokensToday, &costToday, &avgDurationMS); err != nil {
return nil, err
}
return &Summary{
ActiveSessions: activeSessions,
RunsToday: totalRuns,
ToolCallsToday: totalTools,
ErrorsToday: totalErrors,
TokensToday: tokensToday,
CostToday: costToday,
AvgDurationMS: avgDurationMS,
ByFramework: byFramework,
}, nil
}
@@ -104,6 +134,11 @@ type TopTool struct {
Count int `json:"count"`
}
// TopModel is one entry of the ranking produced by GetTopModels: a model name
// together with the number of rows counted for it.
type TopModel struct {
// Name is the model identifier; serialized as "name".
Name string `json:"name"`
// Count is the number of matching events for Name; serialized as "count".
Count int `json:"count"`
}
func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
if limit <= 0 {
limit = 10
@@ -141,6 +176,43 @@ func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
return out, rows.Err()
}
// GetTopModels ranks model names by how many run.end events since the start
// of the current day (midnight in time.Now's location) carry that name under
// payload.payload.model, most frequent first. Events with a missing or empty
// model are excluded by the query. A non-positive limit is replaced by 10.
// The returned slice is nil when no rows match.
func (d *DB) GetTopModels(ctx context.Context, limit int) ([]TopModel, error) {
	if limit < 1 {
		limit = 10
	}

	now := time.Now()
	year, month, day := now.Date()
	startOfDay := time.Date(year, month, day, 0, 0, 0, 0, now.Location())

	const topModelsSQL = `
SELECT
payload->'payload'->>'model' AS model_name,
COUNT(*) AS cnt
FROM events
WHERE type = 'run.end'
AND payload->'payload'->>'model' IS NOT NULL
AND payload->'payload'->>'model' <> ''
AND ts >= $1
GROUP BY model_name
ORDER BY cnt DESC
LIMIT $2
`

	rows, queryErr := d.sql.QueryContext(ctx, topModelsSQL, startOfDay, limit)
	if queryErr != nil {
		return nil, queryErr
	}
	defer rows.Close()

	var models []TopModel
	for rows.Next() {
		var entry TopModel
		if scanErr := rows.Scan(&entry.Name, &entry.Count); scanErr != nil {
			return nil, scanErr
		}
		models = append(models, entry)
	}
	// Report any iteration error alongside whatever was collected.
	return models, rows.Err()
}
func bucketForWindow(window string) string {
switch window {
case "1h":
@@ -191,9 +263,20 @@ func (d *DB) GetTimeseries(ctx context.Context, window string) (*TimeseriesResul
COUNT(*) FILTER (WHERE type = 'run.start') AS runs,
COUNT(*) FILTER (WHERE type = 'span.end'
AND payload->'attributes'->>'span_kind' = 'tool') AS tools,
COUNT(*) FILTER (WHERE type = 'error') AS errors
COUNT(*) FILTER (WHERE type = 'error') AS errors,
COALESCE(SUM((payload->'payload'->'usage'->>'total_tokens')::bigint)
FILTER (WHERE type = 'run.end'), 0) AS tokens,
COALESCE(SUM((payload->'payload'->'usage'->>'input_tokens')::bigint)
FILTER (WHERE type = 'run.end'), 0) AS input_tokens,
COALESCE(SUM((payload->'payload'->'usage'->>'output_tokens')::bigint)
FILTER (WHERE type = 'run.end'), 0) AS output_tokens,
COALESCE(SUM((payload->'payload'->'usage'->>'total_cost')::float8)
FILTER (WHERE type = 'run.end'), 0) AS cost,
COALESCE(AVG((payload->'payload'->>'duration_ms')::float8)
FILTER (WHERE type = 'run.end'), 0) AS avg_duration_ms
FROM events
WHERE ts >= $2
AND type IN ('run.start', 'run.end', 'span.end', 'error')
GROUP BY bucket_ts
ORDER BY bucket_ts ASC
`
@@ -207,7 +290,8 @@ func (d *DB) GetTimeseries(ctx context.Context, window string) (*TimeseriesResul
var series []TimeseriesBucket
for rows.Next() {
var b TimeseriesBucket
if err := rows.Scan(&b.TS, &b.Runs, &b.Tools, &b.Errors); err != nil {
if err := rows.Scan(&b.TS, &b.Runs, &b.Tools, &b.Errors,
&b.Tokens, &b.InputTokens, &b.OutputTokens, &b.Cost, &b.AvgDurationMS); err != nil {
return nil, err
}
series = append(series, b)