feat(npu): add advisory metrics to utilization digest

Roll up confidence, recommendation, authority, fallback, and service-level metrics, including v1 authority-flag handling.
This commit is contained in:
William Valentin
2026-06-06 15:30:31 -07:00
parent dae2a57124
commit 72434c8bc3
3 changed files with 254 additions and 4 deletions
+1 -1
View File
@@ -33,7 +33,7 @@ scripts/npu-utilization-digest.py --format jsonl --no-write
python -m pytest tests/test_npu_utilization_digest.py -q
```
Output shape is intentionally small: service booleans, counts, average probe ms, sysfs deltas, proof flags, fallback warning counts, artifact path, and closed gates. `fallbacks` includes unavailable services, failed/missing proof, and skipped proof-capable smokes such as disabled Whisper/doc-triage probes or GenAI cold-load skips; intentionally health-only RAG/advisory rows are not fallbacks unless unavailable. It does not print raw embeddings, transcripts, OCR text, model completions, request headers, or full upstream JSON.
Output shape is intentionally small: service booleans, request counts by service, average probe ms, sysfs/NPU busy deltas by service, proof flags, fallback totals and per-service fallback counts, confidence distribution, escalation/suppression recommendation counts, authority-safe flag violation totals, artifact path, and closed gates. `fallbacks` includes unavailable services, failed/missing proof, and skipped proof-capable smokes such as disabled Whisper/doc-triage probes or GenAI cold-load skips; intentionally health-only RAG/advisory rows are not fallbacks unless unavailable. It does not print raw embeddings, transcripts, OCR text, model completions, request headers, or full upstream JSON.
Covered rows:
+177 -1
View File
@@ -72,6 +72,10 @@ class ServiceRow:
dry_run: bool | None = None
suppress: int | None = None
escalate: int | None = None
recommendation: str | None = None
confidence: float | None = None
confidence_bucket: str | None = None
authority_violations: int | None = None
loaded: bool | None = None
allowed_roots_count: int | None = None
reason: str | None = None
@@ -83,6 +87,136 @@ def compact_dict(obj: Any) -> dict[str, Any]:
return {k: v for k, v in data.items() if v is not None and v != []}
AUTHORITY_SAFE_ACTIONS = {
"", "none", "log", "observe", "dry_run", "recommend", "suppress", "escalate",
"record_metric", "compare_with_expected_label", "include_in_digest",
"open_review_ticket_candidate", "recommend_human_review",
}
AUTHORITY_FLAG_KEYS = {
"advisory_post",
"atlas_routing",
"broad_private_scan",
"delivery_send",
"gateway_restart",
"live_routing",
"memory_write",
"outbound_send",
"private_root_scan",
"service_restart",
"tool_execution",
"vector_mutation",
}
AUTHORITY_FLAG_ALIASES = {
"can_route_atlas": "atlas_routing",
"can_write_memory": "memory_write",
"can_execute_tools": "tool_execution",
"can_restart_services": "service_restart",
"can_send_outbound": "outbound_send",
"can_scan_private_roots": "private_root_scan",
"can_mutate_vector_store": "vector_mutation",
"can_post_advisory_event": "advisory_post",
"can_change_gateway_config": "gateway_restart",
"may_route": "atlas_routing",
"may_write_memory": "memory_write",
"may_execute_tools": "tool_execution",
"may_restart_services": "service_restart",
"may_send_external": "outbound_send",
"may_process_private_dirs": "private_root_scan",
"may_mutate_vector_db": "vector_mutation",
"may_change_live_config": "gateway_restart",
}
def confidence_bucket(confidence: float | None) -> str | None:
if confidence is None:
return None
if confidence >= 0.8:
return "high"
if confidence >= 0.5:
return "medium"
return "low"
def coerce_confidence(value: Any) -> float | None:
if isinstance(value, bool):
return None
if isinstance(value, (int, float)):
return max(0.0, min(1.0, float(value)))
if isinstance(value, str):
try:
return max(0.0, min(1.0, float(value)))
except ValueError:
return None
return None
def extract_confidence(payload: dict[str, Any]) -> float | None:
direct = coerce_confidence(payload.get("confidence"))
if direct is not None:
return direct
raw_labels = payload.get("labels")
labels: dict[str, Any] = raw_labels if isinstance(raw_labels, dict) else {}
scores: list[float] = []
for value in labels.values():
if isinstance(value, dict):
for score_key in ("confidence", "score", "probability"):
if score_key in value:
score = coerce_confidence(value.get(score_key))
break
score = None
else:
score = coerce_confidence(value)
if score is not None:
scores.append(score)
return max(scores) if scores else None
def extract_recommendation(payload: dict[str, Any]) -> str | None:
for key in ("recommendation", "classification", "input_class"):
value = payload.get(key)
if isinstance(value, str) and value:
return value[:48]
raw_action = payload.get("action")
action: dict[str, Any] = raw_action if isinstance(raw_action, dict) else {}
value = action.get("recommendation") or action.get("type")
return str(value)[:48] if value else None
def count_authority_violations(payload: dict[str, Any]) -> int:
"""Count advisory response hints that would exceed read-only/dry-run authority.
Supports both legacy compact payloads and `npu_advisory_decision_v1`.
Valid schema-safe allowed actions and object-shaped no-op actual actions must
not count as violations; any true live-authority flag must count.
"""
violations = 0
raw_flags = payload.get("authority_flags")
flags: dict[str, Any] = raw_flags if isinstance(raw_flags, dict) else {}
for key, value in flags.items():
canonical = AUTHORITY_FLAG_ALIASES.get(key, key)
if canonical in AUTHORITY_FLAG_KEYS and bool(value):
violations += 1
raw_allowed = payload.get("allowed_actions")
allowed: list[Any] = raw_allowed if isinstance(raw_allowed, list) else []
for action in allowed:
if str(action).lower() not in AUTHORITY_SAFE_ACTIONS:
violations += 1
raw_actual = payload.get("actual_action")
if isinstance(raw_actual, dict):
performed = bool(raw_actual.get("performed"))
side_effects = raw_actual.get("side_effects") or []
kind = str(raw_actual.get("kind") or "none").lower()
if performed or side_effects or kind not in AUTHORITY_SAFE_ACTIONS | {"recorded_metric", "dry_run_reported"}:
violations += 1
else:
actual = str(raw_actual or "").lower()
if actual and actual not in AUTHORITY_SAFE_ACTIONS:
violations += 1
return violations
def read_busy(path: Path = BUSY_PATH) -> int | None:
try:
return int(path.read_text().strip())
@@ -234,6 +368,12 @@ def probe_classifier(timeout: float, busy_path: Path = BUSY_PATH, post_json: Cal
action: dict[str, Any] = raw_action if isinstance(raw_action, dict) else {}
row.escalate = int(bool(action.get("escalate") or labels.get("action_required") or labels.get("tool_needed")))
row.suppress = int(bool(action.get("suppress") or labels.get("no_op") or labels.get("duplicate")))
row.recommendation = extract_recommendation(data) or ("escalate" if row.escalate else "suppress" if row.suppress else "log")
row.confidence = extract_confidence(data)
row.confidence_bucket = confidence_bucket(row.confidence)
row.authority_violations = count_authority_violations(data)
if row.authority_violations:
row.warnings.append("authority_violation")
row.items = len(labels)
apply_proof(row, delta)
if not row.reachable:
@@ -387,10 +527,28 @@ def build_summary(rows: list[ServiceRow], artifact_path: str | None, counter_del
proof_ok = sum(1 for r in proof_rows if r.proof_ok)
gates_closed = sum(1 for r in rows if str(r.gate).startswith("closed:"))
fallbacks = sum(r.fallbacks for r in rows)
request_counts_by_service = {r.service: r.calls for r in rows if r.calls}
npu_busy_delta_us_by_service = {r.service: r.npu_delta_us for r in rows if r.npu_delta_us is not None}
fallbacks_by_service = {r.service: r.fallbacks for r in rows if r.fallbacks}
recommendation_counts = {"escalate": 0, "suppress": 0}
confidence_distribution: dict[str, int] = {"low": 0, "medium": 0, "high": 0, "unknown": 0}
authority_violations = 0
warnings: dict[str, int] = {}
for row in rows:
recommendation = (row.recommendation or "").lower()
if recommendation in recommendation_counts:
recommendation_counts[recommendation] += 1
else:
recommendation_counts["escalate"] += row.escalate or 0
recommendation_counts["suppress"] += row.suppress or 0
if row.confidence_bucket:
confidence_distribution[row.confidence_bucket] = confidence_distribution.get(row.confidence_bucket, 0) + 1
elif row.recommendation or row.escalate is not None or row.suppress is not None:
confidence_distribution["unknown"] += 1
authority_violations += row.authority_violations or 0
for warning in row.warnings:
warnings[warning] = warnings.get(warning, 0) + 1
confidence_distribution = {k: v for k, v in confidence_distribution.items() if v}
return {
"type": "summary",
"timestamp": started_at,
@@ -401,6 +559,12 @@ def build_summary(rows: list[ServiceRow], artifact_path: str | None, counter_del
"proof_ok": proof_ok,
"proof_total": len(proof_rows),
"fallbacks": fallbacks,
"fallbacks_by_service": fallbacks_by_service,
"request_counts_by_service": request_counts_by_service,
"npu_busy_delta_us_by_service": npu_busy_delta_us_by_service,
"confidence_distribution": confidence_distribution,
"recommendation_counts": {k: v for k, v in recommendation_counts.items() if v},
"authority_violations": authority_violations,
"gates_closed": gates_closed,
"warnings": warnings,
"artifact": artifact_path,
@@ -411,8 +575,14 @@ def render_text(summary: dict[str, Any], rows: list[ServiceRow]) -> str:
lines = [
f"NPU utilization digest {summary['timestamp']}",
f"counter={summary['counter']} delta_us={summary.get('delta_us')}",
f"services_ok={summary['services_ok']}/{summary['services_total']} proof_ok={summary['proof_ok']}/{summary['proof_total']} fallbacks={summary['fallbacks']} gates_closed={summary['gates_closed']}",
f"services_ok={summary['services_ok']}/{summary['services_total']} proof_ok={summary['proof_ok']}/{summary['proof_total']} fallbacks={summary['fallbacks']} authority_violations={summary['authority_violations']} gates_closed={summary['gates_closed']}",
]
rec_counts = summary.get("recommendation_counts") or {}
if rec_counts:
lines.append("recommendations: " + " ".join(f"{k}={v}" for k, v in sorted(rec_counts.items())))
conf_dist = summary.get("confidence_distribution") or {}
if conf_dist:
lines.append("confidence: " + " ".join(f"{k}={v}" for k, v in sorted(conf_dist.items())))
for r in rows:
parts = [f"- {r.service}:", f"ok={str(r.reachable).lower()}"]
if r.calls:
@@ -437,6 +607,12 @@ def render_text(summary: dict[str, Any], rows: list[ServiceRow]) -> str:
parts.append(f"suppress={r.suppress}")
if r.escalate is not None:
parts.append(f"escalate={r.escalate}")
if r.recommendation is not None:
parts.append(f"recommendation={r.recommendation}")
if r.confidence_bucket is not None:
parts.append(f"confidence={r.confidence_bucket}")
if r.authority_violations is not None:
parts.append(f"authority_violations={r.authority_violations}")
if r.loaded is not None:
parts.append(f"loaded={str(r.loaded).lower()}")
if r.allowed_roots_count is not None:
+76 -2
View File
@@ -67,7 +67,15 @@ def test_classifier_dry_run_payload(tmp_path, monkeypatch):
def fake_post(url, payload, timeout):
seen.update(payload)
busy.write_text("35")
return 200, {"labels": {"tool_needed": True, "duplicate": False}, "npu_busy_delta_us": 25}
return 200, {
"labels": {"tool_needed": True, "duplicate": False},
"recommendation": "escalate",
"confidence": 0.84,
"authority_flags": {"tool_execution": False, "memory_write": False},
"allowed_actions": ["log", "recommend"],
"actual_action": "dry_run",
"npu_busy_delta_us": 25,
}
monkeypatch.setattr(digest, "health_row", fake_health)
row = digest.probe_classifier(1, busy_path=busy, post_json=fake_post)
@@ -75,6 +83,10 @@ def test_classifier_dry_run_payload(tmp_path, monkeypatch):
assert seen["options"]["include_evidence"] is False
assert row.escalate == 1
assert row.suppress == 0
assert row.recommendation == "escalate"
assert row.confidence == 0.84
assert row.confidence_bucket == "high"
assert row.authority_violations == 0
assert row.proof_ok is True
@@ -145,15 +157,77 @@ def test_disabled_proof_smokes_count_as_fallbacks(monkeypatch):
def test_jsonl_shape(tmp_path):
rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=True, npu_delta_us=1)]
rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=True, calls=1, npu_delta_us=1)]
summary = digest.build_summary(rows, None, 1, "2026-06-05T14:20:00-07:00")
path = digest.write_jsonl(summary, rows, tmp_path)
lines = [json.loads(line) for line in path.read_text().splitlines()]
assert lines[0]["type"] == "summary"
assert lines[0]["request_counts_by_service"] == {"embeddings": 1}
assert lines[0]["npu_busy_delta_us_by_service"] == {"embeddings": 1}
assert lines[1]["type"] == "service"
assert lines[1]["service"] == "embeddings"
def test_summary_observability_rollups_and_text():
rows = [
digest.ServiceRow(service="classifier", reachable=True, calls=1, npu_delta_us=25, fallbacks=0, escalate=1, suppress=0, recommendation="escalate", confidence=0.84, confidence_bucket="high", authority_violations=0),
digest.ServiceRow(service="doc_triage", reachable=True, calls=1, npu_delta_us=7, fallbacks=1, warnings=["no_positive_sysfs_delta"]),
digest.ServiceRow(service="advisory_gateway", reachable=True, gate="closed:advisory-post", authority_violations=1, warnings=["authority_violation"]),
]
summary = digest.build_summary(rows, None, 32, "2026-06-05T14:20:00-07:00")
assert summary["request_counts_by_service"] == {"classifier": 1, "doc_triage": 1}
assert summary["npu_busy_delta_us_by_service"] == {"classifier": 25, "doc_triage": 7}
assert summary["fallbacks_by_service"] == {"doc_triage": 1}
assert summary["confidence_distribution"] == {"high": 1}
assert summary["recommendation_counts"] == {"escalate": 1}
assert summary["authority_violations"] == 1
text = digest.render_text(summary, rows)
assert "authority_violations=1" in text
assert "recommendations: escalate=1" in text
assert "confidence: high=1" in text
def test_authority_violation_detection():
assert digest.count_authority_violations({
"authority_flags": {"tool_execution": True, "memory_write": False},
"allowed_actions": ["log", "service_restart"],
"actual_action": "outbound_send",
}) == 3
def test_v1_authority_violation_detection():
safe_payload = {
"authority_flags": {
"can_route_atlas": False,
"can_write_memory": False,
"can_execute_tools": False,
"can_restart_services": False,
"can_send_outbound": False,
"can_scan_private_roots": False,
"can_mutate_vector_store": False,
"can_post_advisory_event": False,
"can_change_gateway_config": False,
"requires_human_approval": True,
"advisory_only": True,
},
"allowed_actions": ["record_metric", "compare_with_expected_label", "include_in_digest", "recommend_human_review"],
"actual_action": {"kind": "dry_run_reported", "performed": False, "performed_by": "harness", "side_effects": []},
}
assert digest.count_authority_violations(safe_payload) == 0
unsafe = dict(safe_payload)
unsafe["authority_flags"] = dict(safe_payload["authority_flags"], can_execute_tools=True)
assert digest.count_authority_violations(unsafe) == 1
def test_recommendation_only_and_zero_confidence_rollups():
payload = {"labels": {"no_op": {"confidence": 0.0, "score": 0.9}}, "recommendation": "suppress"}
assert digest.extract_confidence(payload) == 0.0
row = digest.ServiceRow(service="classifier", reachable=True, recommendation="suppress", confidence=0.0, confidence_bucket="low")
summary = digest.build_summary([row], None, None, "2026-06-05T14:20:00-07:00")
assert summary["recommendation_counts"] == {"suppress": 1}
assert summary["confidence_distribution"] == {"low": 1}
def test_exit_codes(monkeypatch):
rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=False, warnings=["no_positive_sysfs_delta"])]
summary = digest.build_summary(rows, None, 0, "2026-06-05T14:20:00-07:00")