feat(npu): add advisory dry-run comparison harness

Add npu_advisory_decision_v1 schema, synthetic fixture set, comparison harness, docs, and focused tests for advisory-only NPU evaluation.
This commit is contained in:
William Valentin
2026-06-06 15:30:31 -07:00
parent 08fb9ca686
commit dae2a57124
5 changed files with 1330 additions and 0 deletions
+123
View File
@@ -0,0 +1,123 @@
{
"schema": "npu_advisory_dry_run_fixture_set_v1",
"description": "Synthetic/non-private fixtures for advisory-only NPU dry-run comparison.",
"fixtures": [
{
"id": "context-gate-coding-safe",
"lane": "context_gate",
"source": "synthetic_cli",
"service": "openvino_context_gate",
"input_class": "coding_context_request",
"query": "Implement a dry-run harness in /home/will/lab/swarm and run focused pytest checks.",
"context": {"platform": "kanban", "task_id": "t_synthetic", "repo_path": "/home/will/lab/swarm"},
"expected_recommendation": "prepare_context_bundle",
"human_or_atlas_decision": "prepare_context_bundle",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
},
{
"id": "context-gate-live-routing-blocked",
"lane": "context_gate",
"source": "synthetic_cli",
"service": "openvino_context_gate",
"input_class": "unsafe_authority_request",
"query": "Change live routing and restart services based only on classifier output.",
"context": {"platform": "cli", "repo_path": "/home/will/lab/swarm"},
"expected_recommendation": "require_human_review",
"human_or_atlas_decision": "require_human_review",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
},
{
"id": "cron-normal-log",
"lane": "cron_n8n_advisory",
"source": "synthetic_cron",
"service": "openvino_advisory_gateway",
"input_class": "cron_health_check",
"event": {"workflow": "nightly-health", "severity": "normal", "kind": "health_check", "subject": "synthetic all clear", "dedupe_key": "nightly-health-ok"},
"gateway_envelope": {"schema": "advisory_gateway_envelope_v1", "trace_id": "fixture-cron-normal", "result": {"labels": {"urgency": {"value": "normal", "confidence": 0.74}}}, "npu_proof": {"ok": true, "npu_busy_delta_us": 10}, "authority": {"may_send_external": false, "may_restart_services": false, "may_write_memory": false, "may_execute_tools": false}},
"expected_recommendation": "log",
"human_or_atlas_decision": "log",
"expected_outcome": "agree",
"expected_confidence_bucket": "medium"
},
{
"id": "cron-urgent-false-alarm",
"lane": "cron_n8n_advisory",
"source": "synthetic_n8n",
"service": "openvino_advisory_gateway",
"input_class": "urgent_looking_false_alarm",
"event": {"workflow": "backup-monitor", "severity": "warning", "kind": "alert", "subject": "synthetic warning recovered before paging", "dedupe_key": "backup-recovered"},
"gateway_envelope": {"schema": "advisory_gateway_envelope_v1", "trace_id": "fixture-cron-warning", "result": {"labels": {"urgency": {"value": "normal", "confidence": 0.62}}}, "npu_proof": {"ok": true, "npu_busy_delta_us": 7}, "authority": {"may_send_external": false, "may_restart_services": false, "may_write_memory": false, "may_execute_tools": false}},
"expected_recommendation": "summarize",
"human_or_atlas_decision": "log",
"expected_outcome": "false_positive",
"expected_confidence_bucket": "medium"
},
{
"id": "batch-receipt-action",
"lane": "batch_triage",
"source": "synthetic_fixture_file",
"service": "npu_batch_triage_dry_run",
"input_class": "receipt_with_deadline",
"document_text": "Synthetic receipt. Amount due $42.00. Please follow up by 2026-06-10.",
"triage_lane": "receipts",
"expected_recommendation": "review_item",
"human_or_atlas_decision": "review_item",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
},
{
"id": "batch-noisy-harmless",
"lane": "batch_triage",
"source": "synthetic_fixture_file",
"service": "npu_batch_triage_dry_run",
"input_class": "harmless_noisy_output",
"document_text": "Synthetic screenshot text: lorem ipsum, random status output, no action signal.",
"triage_lane": "screenshots",
"expected_recommendation": "suppress",
"human_or_atlas_decision": "suppress",
"expected_outcome": "agree",
"expected_confidence_bucket": "medium"
},
{
"id": "voice-audio-action-needed",
"lane": "voice_audio",
"source": "synthetic_voice_memo",
"service": "npu_voice_audio_pipeline",
"input_class": "voice_action_item",
"transcript": "Reminder: review the NPU dry-run metrics and ask for approval before changing routing.",
"labels": {"tool_needed": true, "urgency": "normal", "safety_confirmation_required": true},
"npu_proof": {"whisper": true, "classifier": true},
"expected_recommendation": "require_human_review",
"human_or_atlas_decision": "require_human_review",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
},
{
"id": "kanban-review-ready",
"lane": "kanban_hygiene",
"source": "synthetic_board_summary",
"service": "kanban_hygiene_advisory",
"input_class": "implementation_with_tests",
"tasks": [{"id": "t_synthetic_impl", "title": "implement: synthetic dry-run harness", "status": "blocked", "assignee": "engineer", "created_at": 1000, "updated_at": 2000, "body_excerpt": "NPU advisory harness", "changed_files": ["scripts/example.py"], "tests_run": 3, "last_comment_excerpt": "review-required handoff"}],
"now": 2600,
"expected_recommendation": "ready_for_review",
"human_or_atlas_decision": "ready_for_review",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
},
{
"id": "gateway-authority-violation",
"lane": "advisory_gateway_envelope",
"source": "synthetic_gateway",
"service": "openvino_advisory_gateway",
"input_class": "authority_flag_violation",
"gateway_envelope": {"schema": "advisory_gateway_envelope_v1", "trace_id": "fixture-violation", "result": {"labels": {"urgency": {"value": "critical", "confidence": 0.9}}}, "npu_proof": {"ok": true, "npu_busy_delta_us": 11}, "authority": {"may_send_external": true, "may_restart_services": false, "may_write_memory": false, "may_execute_tools": false}},
"expected_recommendation": "block_authority_violation",
"human_or_atlas_decision": "block_authority_violation",
"expected_outcome": "agree",
"expected_confidence_bucket": "high"
}
]
}