feat(metrics): surface tool-span latency in stats and dashboard

Tool spans already carry duration_ms and status, but the metrics layer only counted them. Expose that data:

- GetTopTools now returns avg/p95 duration and error count per tool.
- Timeseries buckets gain tool_avg_ms / tool_p95_ms (filtered percentile_cont over tool spans).
- Dashboard Top Tools shows avg latency per tool; the Latency panel, previously always empty (it read run-level duration that is never emitted), now plots real tool-span latency (min/avg/p95).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-23 11:16:23 -07:00
parent c44e7fe72e
commit 5014d89258
2 changed files with 48 additions and 17 deletions
@@ -488,6 +488,10 @@ function tallyTool(evt) {
    if (attrs.span_kind === 'tool') {
      const name = attrs.name || 'unknown';
      dashboardState.toolCounts[name] = (dashboardState.toolCounts[name] || 0) + 1;
+      const dur = Number(getEnvelopePayload(evt).duration_ms) || 0;
+      if (dur > 0) {
+        dashboardState.toolDurations[name] = (dashboardState.toolDurations[name] || 0) + dur;
+      }
    }
  }
 }
@@ -618,33 +622,34 @@ function renderLatencyPanel() {
    return;
  }

-  const durSeries = ts.series.map(b => b.avg_duration_ms || 0).filter(v => v > 0);
-  if (durSeries.length === 0) {
-    container.innerHTML = '<p class="empty-state" style="padding:1rem">No run latency recorded yet</p>';
+  const latencyBuckets = ts.series.filter(b => (b.tool_avg_ms || 0) > 0);
+  if (latencyBuckets.length === 0) {
+    container.innerHTML = '<p class="empty-state" style="padding:1rem">No tool latency recorded yet</p>';
    return;
  }

+  const durSeries = latencyBuckets.map(b => b.tool_avg_ms || 0);
  const avg = durSeries.reduce((a, b) => a + b, 0) / durSeries.length;
  const min = Math.min(...durSeries);
-  const max = Math.max(...durSeries);
-  const maxBar = max || 1;
+  const p95 = Math.max(...latencyBuckets.map(b => b.tool_p95_ms || 0));
+  const maxBar = Math.max(...durSeries) || 1;

  container.innerHTML = `
    <div class="latency-panel">
      <div class="latency-range">
        ${metricPill({ label: 'Min', value: formatDuration(min), variant: 'range' })}
        ${metricPill({ label: 'Avg', value: formatDuration(avg), variant: 'range' })}
-        ${metricPill({ label: 'Max', value: formatDuration(max), variant: 'range' })}
+        ${metricPill({ label: 'P95', value: formatDuration(p95), variant: 'range' })}
      </div>
      <div class="latency-mini-bars">
-        ${durSeries.map((v, i) => {
+        ${latencyBuckets.map(b => {
+          const v = b.tool_avg_ms || 0;
          const pct = (v / maxBar * 100).toFixed(1);
-          const label = ts.series.filter(b => b.avg_duration_ms > 0)[i];
-          const title = label ? formatBucketLabel(label.ts) + ': ' + formatDuration(v) : formatDuration(v);
+          const title = formatBucketLabel(b.ts) + ': ' + formatDuration(v);
          return `<div class="latency-mini-bar" style="height:${pct}%" title="${escapeHTML(title)}"></div>`;
        }).join('')}
      </div>
-      <div class="am-pill-label" style="margin-top:0.5rem">Avg run duration per bucket (${escapeHTML(ts.bucket || '-')})</div>
+      <div class="am-pill-label" style="margin-top:0.5rem">Avg tool latency per bucket (${escapeHTML(ts.bucket || '-')})</div>
    </div>
  `;
 }
@@ -749,7 +754,14 @@ function renderDashTopTools() {
  const topTools = Object.entries(dashboardState.toolCounts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
-    .map(([name, count]) => ({ name, count }));
+    .map(([name, count]) => {
+      const durSum = dashboardState.toolDurations[name] || 0;
+      const avg = count > 0 ? durSum / count : 0;
+      const countDisplay = avg > 0
+        ? `${formatCount(count)} · ${formatDuration(avg)}`
+        : formatCount(count);
+      return { name, count, countDisplay };
+    });
  list.innerHTML = barRankList(topTools, { emptyText: 'No tool data yet' });
 }

@@ -776,6 +788,7 @@ export async function renderDashboard(routeToken) {
    recentEvents: [],
    recentEventIDs: new Set(),
    toolCounts: {},
+    toolDurations: {},
    modelCounts: {},
    rightPanelMode: localStorage.getItem('agentmon:dash:right-panel') || 'framework',
  };
@@ -955,6 +968,7 @@ export async function renderDashboard(routeToken) {

    for (const t of (topToolsData.tools || [])) {
      dashboardState.toolCounts[t.name] = t.count;
+      dashboardState.toolDurations[t.name] = (t.avg_ms || 0) * (t.count || 0);
    }
    for (const m of (topModelsData.models || [])) {
      dashboardState.modelCounts[m.name] = m.count;
@@ -33,6 +33,8 @@ type TimeseriesBucket struct {
 	OutputTokens  int64     `json:"output_tokens"`
 	Cost          float64   `json:"cost"`
 	AvgDurationMS float64   `json:"avg_duration_ms"`
+	ToolAvgMS     float64   `json:"tool_avg_ms"`
+	ToolP95MS     float64   `json:"tool_p95_ms"`
 }

 type TimeseriesResult struct {
@@ -157,8 +159,11 @@ func (d *DB) GetSummary(ctx context.Context) (*Summary, error) {
 }

 type TopTool struct {
-	Name  string `json:"name"`
-	Count int    `json:"count"`
+	Name   string  `json:"name"`
+	Count  int     `json:"count"`
+	AvgMS  float64 `json:"avg_ms"`
+	P95MS  float64 `json:"p95_ms"`
+	Errors int     `json:"errors"`
 }

 type TopModel struct {
@@ -176,7 +181,11 @@ func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
 	q := `
 		SELECT
 			payload->'attributes'->>'name' AS tool_name,
-			COUNT(*)                        AS cnt
+			COUNT(*)                        AS cnt,
+			COALESCE(AVG((payload->'payload'->>'duration_ms')::float8), 0) AS avg_ms,
+			COALESCE(percentile_cont(0.95) WITHIN GROUP (
+			         ORDER BY (payload->'payload'->>'duration_ms')::float8), 0) AS p95_ms,
+			COUNT(*) FILTER (WHERE payload->'payload'->>'status' = 'error') AS errors
 		FROM events
 		WHERE type = 'span.end'
 		  AND payload->'attributes'->>'span_kind' = 'tool'
@@ -195,7 +204,7 @@ func (d *DB) GetTopTools(ctx context.Context, limit int) ([]TopTool, error) {
 	var out []TopTool
 	for rows.Next() {
 		var t TopTool
-		if err := rows.Scan(&t.Name, &t.Count); err != nil {
+		if err := rows.Scan(&t.Name, &t.Count, &t.AvgMS, &t.P95MS, &t.Errors); err != nil {
 			return nil, err
 		}
 		out = append(out, t)
@@ -300,7 +309,14 @@ func (d *DB) GetTimeseries(ctx context.Context, window string) (*TimeseriesResul
 			COALESCE(SUM((payload->'payload'->'usage'->>'total_cost')::float8)
 			         FILTER (WHERE type = 'run.end'), 0)           AS cost,
 			COALESCE(AVG((payload->'payload'->>'duration_ms')::float8)
-			         FILTER (WHERE type = 'run.end'), 0)           AS avg_duration_ms
+			         FILTER (WHERE type = 'run.end'), 0)           AS avg_duration_ms,
+			COALESCE(AVG((payload->'payload'->>'duration_ms')::float8)
+			         FILTER (WHERE type = 'span.end'
+			                   AND payload->'attributes'->>'span_kind' = 'tool'), 0) AS tool_avg_ms,
+			COALESCE(percentile_cont(0.95) WITHIN GROUP (
+			         ORDER BY (payload->'payload'->>'duration_ms')::float8)
+			         FILTER (WHERE type = 'span.end'
+			                   AND payload->'attributes'->>'span_kind' = 'tool'), 0) AS tool_p95_ms
 		FROM events
 		WHERE ts >= $2
 		  AND type IN ('run.start', 'run.end', 'span.end', 'error')
@@ -318,7 +334,8 @@ func (d *DB) GetTimeseries(ctx context.Context, window string) (*TimeseriesResul
 	for rows.Next() {
 		var b TimeseriesBucket
 		if err := rows.Scan(&b.TS, &b.Runs, &b.Tools, &b.Errors,
-			&b.Tokens, &b.InputTokens, &b.OutputTokens, &b.Cost, &b.AvgDurationMS); err != nil {
+			&b.Tokens, &b.InputTokens, &b.OutputTokens, &b.Cost, &b.AvgDurationMS,
+			&b.ToolAvgMS, &b.ToolP95MS); err != nil {
 			return nil, err
 		}
 		series = append(series, b)