feat(npu): add advisory metrics to utilization digest

Roll up the confidence distribution, recommendation counts, authority violations, and per-service request, NPU busy-delta, and fallback counts into the digest summary, including handling of v1-style authority flags.
2026-06-06 15:30:31 -07:00
parent dae2a57124
commit 72434c8bc3
3 changed files with 254 additions and 4 deletions
@@ -67,7 +67,15 @@ def test_classifier_dry_run_payload(tmp_path, monkeypatch):
    def fake_post(url, payload, timeout):
        seen.update(payload)
        busy.write_text("35")
-        return 200, {"labels": {"tool_needed": True, "duplicate": False}, "npu_busy_delta_us": 25}
+        return 200, {
+            "labels": {"tool_needed": True, "duplicate": False},
+            "recommendation": "escalate",
+            "confidence": 0.84,
+            "authority_flags": {"tool_execution": False, "memory_write": False},
+            "allowed_actions": ["log", "recommend"],
+            "actual_action": "dry_run",
+            "npu_busy_delta_us": 25,
+        }

    monkeypatch.setattr(digest, "health_row", fake_health)
    row = digest.probe_classifier(1, busy_path=busy, post_json=fake_post)
@@ -75,6 +83,10 @@ def test_classifier_dry_run_payload(tmp_path, monkeypatch):
    assert seen["options"]["include_evidence"] is False
    assert row.escalate == 1
    assert row.suppress == 0
+    assert row.recommendation == "escalate"
+    assert row.confidence == 0.84
+    assert row.confidence_bucket == "high"
+    assert row.authority_violations == 0
    assert row.proof_ok is True


@@ -145,15 +157,77 @@ def test_disabled_proof_smokes_count_as_fallbacks(monkeypatch):


 def test_jsonl_shape(tmp_path):
-    rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=True, npu_delta_us=1)]
+    rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=True, calls=1, npu_delta_us=1)]
    summary = digest.build_summary(rows, None, 1, "2026-06-05T14:20:00-07:00")
    path = digest.write_jsonl(summary, rows, tmp_path)
    lines = [json.loads(line) for line in path.read_text().splitlines()]
    assert lines[0]["type"] == "summary"
+    assert lines[0]["request_counts_by_service"] == {"embeddings": 1}
+    assert lines[0]["npu_busy_delta_us_by_service"] == {"embeddings": 1}
    assert lines[1]["type"] == "service"
    assert lines[1]["service"] == "embeddings"


+def test_summary_observability_rollups_and_text():
+    rows = [
+        digest.ServiceRow(service="classifier", reachable=True, calls=1, npu_delta_us=25, fallbacks=0, escalate=1, suppress=0, recommendation="escalate", confidence=0.84, confidence_bucket="high", authority_violations=0),
+        digest.ServiceRow(service="doc_triage", reachable=True, calls=1, npu_delta_us=7, fallbacks=1, warnings=["no_positive_sysfs_delta"]),
+        digest.ServiceRow(service="advisory_gateway", reachable=True, gate="closed:advisory-post", authority_violations=1, warnings=["authority_violation"]),
+    ]
+    summary = digest.build_summary(rows, None, 32, "2026-06-05T14:20:00-07:00")
+    assert summary["request_counts_by_service"] == {"classifier": 1, "doc_triage": 1}
+    assert summary["npu_busy_delta_us_by_service"] == {"classifier": 25, "doc_triage": 7}
+    assert summary["fallbacks_by_service"] == {"doc_triage": 1}
+    assert summary["confidence_distribution"] == {"high": 1}
+    assert summary["recommendation_counts"] == {"escalate": 1}
+    assert summary["authority_violations"] == 1
+    text = digest.render_text(summary, rows)
+    assert "authority_violations=1" in text
+    assert "recommendations: escalate=1" in text
+    assert "confidence: high=1" in text
+
+
+def test_authority_violation_detection():
+    assert digest.count_authority_violations({
+        "authority_flags": {"tool_execution": True, "memory_write": False},
+        "allowed_actions": ["log", "service_restart"],
+        "actual_action": "outbound_send",
+    }) == 3
+
+
+def test_v1_authority_violation_detection():
+    safe_payload = {
+        "authority_flags": {
+            "can_route_atlas": False,
+            "can_write_memory": False,
+            "can_execute_tools": False,
+            "can_restart_services": False,
+            "can_send_outbound": False,
+            "can_scan_private_roots": False,
+            "can_mutate_vector_store": False,
+            "can_post_advisory_event": False,
+            "can_change_gateway_config": False,
+            "requires_human_approval": True,
+            "advisory_only": True,
+        },
+        "allowed_actions": ["record_metric", "compare_with_expected_label", "include_in_digest", "recommend_human_review"],
+        "actual_action": {"kind": "dry_run_reported", "performed": False, "performed_by": "harness", "side_effects": []},
+    }
+    assert digest.count_authority_violations(safe_payload) == 0
+    unsafe = dict(safe_payload)
+    unsafe["authority_flags"] = dict(safe_payload["authority_flags"], can_execute_tools=True)
+    assert digest.count_authority_violations(unsafe) == 1
+
+
+def test_recommendation_only_and_zero_confidence_rollups():
+    payload = {"labels": {"no_op": {"confidence": 0.0, "score": 0.9}}, "recommendation": "suppress"}
+    assert digest.extract_confidence(payload) == 0.0
+    row = digest.ServiceRow(service="classifier", reachable=True, recommendation="suppress", confidence=0.0, confidence_bucket="low")
+    summary = digest.build_summary([row], None, None, "2026-06-05T14:20:00-07:00")
+    assert summary["recommendation_counts"] == {"suppress": 1}
+    assert summary["confidence_distribution"] == {"low": 1}
+
+
 def test_exit_codes(monkeypatch):
    rows = [digest.ServiceRow(service="embeddings", reachable=True, probe_ran=True, proof_ok=False, warnings=["no_positive_sysfs_delta"])]
    summary = digest.build_summary(rows, None, 0, "2026-06-05T14:20:00-07:00")