feat(npu): add explicit-root batch triage wrapper

This commit is contained in:
William Valentin
2026-06-05 15:52:43 -07:00
parent 6155b54ab5
commit 6906c2079b
7 changed files with 888 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
version: 1
policy:
default_mode: dry_run
require_explicit_root: true
allow_external_uploads: false
allow_mutations: false
log_raw_text: false
include_full_paths_default: false
npu_proof_path: /sys/class/accel/accel0/device/npu_busy_time_us
# Copy to config/triage-roots.local.yaml and approve exactly one narrow,
# lane-specific staging root. The committed template is intentionally
# unapproved/fail-closed; do not point any lane at broad home, Downloads,
# vault, screenshot, photo-library, or historical audio roots without explicit
# approval for that exact lane/root.
roots:
screenshots:
approved: false
root: null
allowed_extensions: [.png, .jpg, .jpeg, .webp, .heic]
max_files: 50
max_file_mb: 25
receipts:
approved: false
root: null
allowed_extensions: [.png, .jpg, .jpeg, .pdf, .webp]
max_files: 50
max_file_mb: 25
downloads:
approved: false
root: null
allowed_extensions: [.pdf, .png, .jpg, .jpeg, .webp]
max_files: 50
max_file_mb: 25
obsidian_attachments:
approved: false
root: null
allowed_extensions: [.pdf, .png, .jpg, .jpeg, .webp, .mp3, .m4a, .wav, .ogg]
max_files: 50
max_file_mb: 50
voice_memos:
approved: false
root: null
allowed_extensions: [.mp3, .m4a, .wav, .ogg, .opus]
max_files: 25
max_file_mb: 100
meeting_snippets:
approved: false
root: null
allowed_extensions: [.mp3, .m4a, .wav, .ogg, .opus]
max_files: 25
max_file_mb: 200
+46
View File
@@ -0,0 +1,46 @@
version: 1
policy:
default_mode: dry_run
require_explicit_root: true
allow_external_uploads: false
allow_mutations: false
log_raw_text: false
include_full_paths_default: false
npu_proof_path: /sys/class/accel/accel0/device/npu_busy_time_us
roots:
screenshots:
approved: true
root: ../openvino-doc-image-triage-npu/samples
allowed_extensions: [.png, .jpg, .jpeg, .webp, .heic]
max_files: 50
max_file_mb: 25
receipts:
approved: true
root: ../openvino-doc-image-triage-npu/samples
allowed_extensions: [.png, .jpg, .jpeg, .pdf, .webp]
max_files: 50
max_file_mb: 25
downloads:
approved: true
root: ../openvino-doc-image-triage-npu/samples
allowed_extensions: [.pdf, .png, .jpg, .jpeg, .webp]
max_files: 50
max_file_mb: 25
obsidian_attachments:
approved: true
root: ../openvino-doc-image-triage-npu/samples
allowed_extensions: [.pdf, .png, .jpg, .jpeg, .webp, .mp3, .m4a, .wav, .ogg]
max_files: 50
max_file_mb: 50
voice_memos:
approved: true
root: ../tmp/synthetic-voice-memos
allowed_extensions: [.mp3, .m4a, .wav, .ogg, .opus]
max_files: 25
max_file_mb: 100
meeting_snippets:
approved: true
root: ../tmp/synthetic-meeting-snippets
allowed_extensions: [.mp3, .m4a, .wav, .ogg, .opus]
max_files: 25
max_file_mb: 200
+65
View File
@@ -0,0 +1,65 @@
# Explicit-root NPU batch triage dry-run examples
These examples are wrappers only. They do not install cron jobs, enable services,
change Atlas/Hermes routing, write Obsidian/RAG/vector DBs, move/delete files, or
send outbound messages.
The committed manifest template at `config/triage-roots.example.yaml` is
intentionally unapproved. For real private data, copy it to
`config/triage-roots.local.yaml` and approve exactly one narrow lane-specific
staging folder. Request-level `--root` may narrow that manifest root but cannot
broaden it.
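Synthetic document/image smoke, CPU-only/no NPU claim. Run these commands from
the repository root: a relative `--root` is resolved against the current working
directory, not against the manifest's location: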
```bash
python scripts/npu-batch-triage-dry-run.py \
--manifest config/triage-roots.test.yaml \
--lane screenshots \
--root openvino-doc-image-triage-npu/samples \
--limit 5 \
--dry-run \
--no-npu \
--json
```
Synthetic document/image smoke with the existing local embeddings NPU service,
if `127.0.0.1:18817` is healthy. Treat NPU as proven only when `npu.proof_ok` is
true and `npu.busy_delta_us` (or item-level delta) is positive:
```bash
python scripts/npu-batch-triage-dry-run.py \
--manifest config/triage-roots.test.yaml \
--lane receipts \
--root openvino-doc-image-triage-npu/samples \
--limit 5 \
--dry-run \
--json
```
Audio smoke should use generated/public synthetic audio only until a private
audio staging root is approved:
```bash
python scripts/npu-batch-triage-dry-run.py \
--manifest config/triage-roots.test.yaml \
--lane voice_memos \
--root tmp/synthetic-voice-memos \
--limit 3 \
--dry-run \
--no-npu \
--json
```
Cron/n8n shape (disabled example only):
```text
Manual Trigger / disabled cron
-> Execute Command: python /home/will/lab/swarm/scripts/npu-batch-triage-dry-run.py --manifest /home/will/lab/swarm/config/triage-roots.local.yaml --lane receipts --limit 25 --dry-run --json
-> IF ok && npu.proof_ok && files_processed > 0
-> local dashboard/report only
```
Do not connect this output to Telegram/Discord/email sends, Obsidian writes,
RAG/vector reindex, file moves/deletes, Kanban mutation, service restarts, or
Atlas/Hermes routing without a separate reviewed approval gate.
+523
View File
@@ -0,0 +1,523 @@
#!/usr/bin/env python3
"""Explicit-root dry-run batch triage for local documents, images, and audio.
This wrapper is intentionally report-only. It requires a lane-scoped approved
root in a manifest, rejects request roots that broaden that approval, redacts raw
text/transcripts by default, and never mutates Obsidian, RAG/vector DBs, files,
routing, memory, services, or sends.
"""
from __future__ import annotations
import argparse
import datetime as dt
import hashlib
import ipaddress
import importlib.util
import json
import mimetypes
import os
import re
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Any
try:
import yaml # type: ignore
except Exception as exc: # pragma: no cover
raise SystemExit("PyYAML is required to read triage root manifests") from exc
# Lane names accepted by --lane; each must also be approved in the manifest.
LANES = (
    "screenshots",
    "receipts",
    "downloads",
    "obsidian_attachments",
    "voice_memos",
    "meeting_snippets",
)
# Lanes that process() routes to the audio item builder.
AUDIO_LANES = {"voice_memos", "meeting_snippets"}
# Lanes that process() routes to the document/image triage module or its fallback.
DOC_IMAGE_LANES = {"screenshots", "receipts", "downloads", "obsidian_attachments"}
# Directory names pruned from the walk in iter_files().
SKIP_DIR_NAMES = {".git", ".obsidian", "__pycache__", ".cache", "cache", "chroma", "chromadb", "vector_db", "vectors"}
# sysfs counter read before/after work to derive an NPU busy-time delta.
NPU_BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
# Default transcription endpoint; validate_local_whisper_url() only accepts
# plain http on port 18816 at localhost or a loopback IP.
DEFAULT_WHISPER_URL = "http://127.0.0.1:18816/v1/audio/transcriptions"
# Template for the "mutations" section of every report; all values are False.
MUTATIONS_FALSE = {
    "obsidian": False,
    "rag": False,
    "vector_db": False,
    "sends": False,
    "file_moves": False,
    "routing": False,
    "memory": False,
    "service_restarts": False,
}
# Case-insensitive keyword patterns; the names of matching patterns are
# reported as an item's "reasons".
ACTION_PATTERNS = {
    "follow_up": re.compile(r"\b(follow up|follow-up|circle back|reply|respond)\b", re.I),
    "date_or_deadline": re.compile(r"\b(deadline|due|by (?:mon|tue|wed|thu|fri|sat|sun)|20\d{2}[-/]\d{1,2}[-/]\d{1,2})\b", re.I),
    "decision": re.compile(r"\b(decided|decision|approved|rejected|go with|choose)\b", re.I),
    "task": re.compile(r"\b(todo|to-do|action item|assign|need to|please)\b", re.I),
}
class FailClosed(Exception):
    """Signal that an approval or policy check failed and the run must stop.

    ``main`` catches this and reports ``error: fail_closed`` with the
    exception text as the reason.
    """
def sha256_text(text: str) -> str:
    """Return ``sha256:<hex>`` for *text* encoded as UTF-8.

    Unencodable characters are substituted (``errors="replace"``) rather than
    raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    digest = hashlib.sha256(encoded).hexdigest()
    return "sha256:" + digest
def sha256_file(path: Path) -> str:
    """Return ``sha256:<hex>`` of the file's bytes, read in 1 MiB chunks."""
    hasher = hashlib.sha256()
    chunk_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            hasher.update(block)
    return "sha256:" + hasher.hexdigest()
def read_busy(path: Path = NPU_BUSY_PATH) -> int | None:
    """Read an integer counter from *path*; return ``None`` on any failure.

    Best-effort by design: a missing, unreadable or non-numeric file all
    yield ``None`` rather than an exception.
    """
    try:
        raw = path.read_text()
        value = int(raw.strip())
    except Exception:
        return None
    return value
def validate_local_whisper_url(whisper_url: str) -> str:
    """Return *whisper_url* unchanged if it targets the approved loopback service.

    Accepted URLs use plain ``http``, carry no credentials, use port 18816 and
    name ``localhost`` or a loopback IP address.

    Raises:
        FailClosed: with a ``whisper_url_*`` reason for the first check that
            fails (checks run in the order: parse, scheme, credentials, port,
            host).
    """
    try:
        parts = urllib.parse.urlparse(whisper_url)
        port_approved = parts.port == 18816  # .port raises ValueError on a malformed port
    except ValueError as exc:
        raise FailClosed("whisper_url_invalid") from exc

    if parts.scheme != "http":
        raise FailClosed("whisper_url_scheme_not_http")
    if parts.username or parts.password:
        raise FailClosed("whisper_url_credentials_not_allowed")
    if not port_approved:
        raise FailClosed("whisper_url_port_not_approved")

    hostname = (parts.hostname or "").strip().lower()
    loopback = hostname == "localhost"
    if not loopback:
        try:
            loopback = ipaddress.ip_address(hostname).is_loopback
        except ValueError:
            # Not an IP literal (and not "localhost"): treat as non-loopback.
            loopback = False
    if loopback:
        return whisper_url
    raise FailClosed("whisper_url_not_loopback")
def is_under(path: Path, root: Path) -> bool:
    """Return True when *path*, after resolution, is *root* or lies beneath it.

    Both arguments are resolved first, so symlinks and ``..`` segments are
    collapsed before the containment test.
    """
    try:
        candidate = path.resolve()
        base = root.resolve()
        candidate.relative_to(base)
    except ValueError:
        return False
    else:
        return True
def load_manifest(path: Path) -> dict[str, Any]:
    """Load and validate a triage-root manifest, failing closed on any problem.

    Args:
        path: location of the YAML manifest.

    Returns:
        The parsed manifest mapping (version 1, with a ``roots`` mapping and a
        policy whose safety switches have the required values).

    Raises:
        FailClosed: if the file is missing, unreadable, not valid YAML, not a
            mapping, has the wrong version, has a non-mapping or unsafe
            policy, or lacks a ``roots`` mapping.
    """
    if not path.exists():
        raise FailClosed(f"manifest_missing:{path}")
    # Read/parse failures must surface as FailClosed so the CLI still emits its
    # structured fail_closed JSON instead of a traceback.
    try:
        data = yaml.safe_load(path.read_text())
    except (OSError, UnicodeError) as exc:
        raise FailClosed("manifest_invalid:unreadable") from exc
    except yaml.YAMLError as exc:
        raise FailClosed("manifest_invalid:yaml_parse_error") from exc
    if not isinstance(data, dict):
        raise FailClosed("manifest_invalid:not_mapping")
    if data.get("version") != 1:
        raise FailClosed("manifest_invalid:version_must_be_1")

    policy = data.get("policy") or {}
    if not isinstance(policy, dict):
        raise FailClosed("policy_invalid:not_mapping")
    if policy.get("default_mode", "dry_run") != "dry_run":
        raise FailClosed("policy_invalid:default_mode_not_dry_run")
    # Identity comparison on purpose: only real booleans satisfy the policy,
    # so a missing key or a truthy/falsy stand-in is rejected.
    required = {
        "require_explicit_root": True,
        "allow_external_uploads": False,
        "allow_mutations": False,
        "log_raw_text": False,
    }
    for key, expected in required.items():
        if policy.get(key) is not expected:
            raise FailClosed(f"policy_invalid:{key}")

    if not isinstance(data.get("roots"), dict):
        raise FailClosed("manifest_invalid:roots_missing")
    return data
def resolve_lane_root(manifest: dict[str, Any], manifest_path: Path, lane: str, requested_root: str | None) -> tuple[dict[str, Any], Path, Path]:
    """Resolve the approved root for *lane* and the (possibly narrower) request root.

    A relative manifest root is resolved against the manifest's directory; a
    relative *requested_root* is resolved against the current working
    directory.

    Returns:
        ``(lane_cfg, approved_root, selected_root)`` with both paths resolved.

    Raises:
        FailClosed: if the lane is missing or unapproved, has no root, either
            root is not an existing directory, or the requested root is not
            inside the approved root.
    """
    lane_cfg = (manifest.get("roots") or {}).get(lane)
    if not isinstance(lane_cfg, dict):
        raise FailClosed(f"lane_missing:{lane}")
    if lane_cfg.get("approved") is not True:
        raise FailClosed(f"lane_unapproved:{lane}")

    configured = lane_cfg.get("root")
    if not configured:
        raise FailClosed(f"root_missing:{lane}")

    # Joining onto the manifest directory leaves an absolute root untouched and
    # anchors a relative one next to the manifest.
    approved_root = (manifest_path.parent / Path(str(configured)).expanduser()).resolve()
    if not (approved_root.exists() and approved_root.is_dir()):
        raise FailClosed(f"approved_root_unavailable:{lane}")

    if requested_root:
        selected_root = Path(requested_root).expanduser().resolve()
    else:
        selected_root = approved_root.resolve()
    if not (selected_root.exists() and selected_root.is_dir()):
        raise FailClosed(f"request_root_unavailable:{lane}")
    if not is_under(selected_root, approved_root):
        raise FailClosed(f"request_root_broadens_approval:{lane}")

    return lane_cfg, approved_root, selected_root
def allowed_exts(lane_cfg: dict[str, Any]) -> set[str]:
    """Return the lane's allowed extensions, lower-cased with a leading dot.

    Handles manifest shapes that previously crashed or misbehaved:

    * a missing or null ``allowed_extensions`` yields an empty set;
    * a single scalar string (``allowed_extensions: .png``) is treated as one
      extension instead of being iterated character by character;
    * any other non-sequence value yields an empty set;
    * null, blank and bare-dot entries are ignored.

    The caller treats an empty result as a fail-closed condition.
    """
    raw = lane_cfg.get("allowed_extensions")
    if raw is None:
        return set()
    if isinstance(raw, str):
        raw = [raw]
    elif not isinstance(raw, (list, tuple, set, frozenset)):
        return set()

    exts: set[str] = set()
    for entry in raw:
        if entry is None:
            continue
        ext = str(entry).strip().lower()
        if not ext or ext == ".":
            continue
        exts.add(ext if ext.startswith(".") else "." + ext)
    return exts
def iter_files(root: Path, approved_root: Path, exts: set[str], max_file_mb: float, max_age_days: float | None) -> tuple[list[Path], dict[str, int], int]:
    """Walk *root* and collect eligible files, newest first.

    Args:
        root: directory to walk (the selected, already-resolved request root).
        approved_root: manifest-approved root that nothing may escape.
        exts: allowed lower-case suffixes including the leading dot.
        max_file_mb: per-file size ceiling in MiB.
        max_age_days: if given, files whose mtime is older than this are skipped.

    Returns:
        ``(accepted, skipped, files_seen)``: resolved paths sorted by mtime
        descending (ties broken by path for a stable order), a per-reason skip
        counter, and the number of non-hidden file entries examined.

    Notes:
        * A file must resolve inside both *approved_root* and *root*. A link
          that lands elsewhere in the approved tree is counted as
          ``symlink_escape``, because callers compute ``path.relative_to(root)``.
        * Several entries resolving to the same target are accepted once;
          extras are counted under ``policy``.
        * The mtime is captured once at acceptance, so sorting never re-stats
          a file that may have vanished in the meantime.
    """
    skipped = {"extension": 0, "size": 0, "symlink_escape": 0, "not_regular_file": 0, "too_old": 0, "policy": 0}
    files_seen = 0
    now = time.time()
    max_bytes = int(max_file_mb * 1024 * 1024)
    max_age_seconds = None if max_age_days is None else max_age_days * 86400
    mtimes: dict[Path, float] = {}

    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
        # Prune in place so os.walk never descends into skipped/hidden dirs.
        dirnames[:] = [d for d in dirnames if d not in SKIP_DIR_NAMES and not d.startswith(".")]
        current = Path(dirpath)
        if not is_under(current, approved_root):
            skipped["symlink_escape"] += 1
            dirnames[:] = []
            continue
        for name in filenames:
            if name.startswith("."):
                skipped["policy"] += 1
                continue
            files_seen += 1
            try:
                resolved = (current / name).resolve()
            except Exception:
                skipped["symlink_escape"] += 1
                continue
            if not (is_under(resolved, approved_root) and is_under(resolved, root)):
                skipped["symlink_escape"] += 1
                continue
            if not resolved.is_file():
                skipped["not_regular_file"] += 1
                continue
            if resolved.suffix.lower() not in exts:
                skipped["extension"] += 1
                continue
            try:
                st = resolved.stat()
            except OSError:
                skipped["not_regular_file"] += 1
                continue
            if st.st_size > max_bytes:
                skipped["size"] += 1
                continue
            if max_age_seconds is not None and now - st.st_mtime > max_age_seconds:
                skipped["too_old"] += 1
                continue
            if resolved in mtimes:
                # Same target reached through another entry (e.g. in-root link).
                skipped["policy"] += 1
                continue
            mtimes[resolved] = st.st_mtime

    accepted = sorted(mtimes, key=lambda p: (-mtimes[p], p.as_posix()))
    return accepted, skipped, files_seen
def load_doc_triage_module(repo_root: Path):
    """Import ``openvino-doc-image-triage-npu/triage.py`` as ``doc_image_triage``.

    The freshly created module object is the one registered in ``sys.modules``
    while it executes. The previous ``setdefault`` could leave a stale entry
    registered and execute a different, unregistered object. If execution
    fails, the registry is restored to its prior state so a half-initialised
    module is never left behind.

    Raises:
        RuntimeError: if no import spec/loader can be built for the file.
        Exception: whatever the module's own top-level code raises.
    """
    module_name = "doc_image_triage"
    module_path = repo_root / "openvino-doc-image-triage-npu" / "triage.py"
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    if spec is None or spec.loader is None:
        raise RuntimeError("doc_image_triage_import_failed")
    module = importlib.util.module_from_spec(spec)

    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)  # type: ignore[union-attr]
    except BaseException:
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
def fallback_doc_item(path: Path, root: Path, lane: str) -> dict[str, Any]:
    """Build a redacted report item using only CPU keyword rules.

    Text comes from an optional ``<name><suffix>.txt`` sidecar next to *path*
    (at most 12000 characters). Only derived labels and counts are returned;
    the text itself never appears in the item.

    The sidecar is used only if it is a regular file that resolves inside
    *root*, so a sidecar symlink cannot pull in content from outside the
    selected root. Reading is bounded instead of loading the whole file and
    truncating afterwards.
    """
    sidecar = path.with_suffix(path.suffix + ".txt")
    text = ""
    if sidecar.is_file() and is_under(sidecar, root):
        with sidecar.open(errors="replace") as handle:
            text = handle.read(12000)

    lower = text.lower()
    category = "unknown_or_low_confidence"
    if any(w in lower for w in ("receipt", "subtotal", "store")):
        category = "receipt"
    elif any(w in lower for w in ("invoice", "amount due", "payment due")):
        category = "bill_or_invoice"
    elif lane == "screenshots":
        category = "screenshot_web_or_app"

    reasons = [name for name, rx in ACTION_PATTERNS.items() if rx.search(text)]
    dates = set(re.findall(r"\b20\d{2}[-/]\d{1,2}[-/]\d{1,2}\b", text))
    amounts = set(re.findall(r"\$\s?\d+(?:\.\d{2})?", text))
    return {
        "basename": path.name,
        "relative_path_hash": sha256_text(path.relative_to(root).as_posix()),
        "file_id": sha256_file(path),
        "media_type": infer_media_type(path),
        "category": category,
        "needs_attention": bool(reasons),
        "reasons": sorted(reasons),
        "raw_text_redacted": True,
        "full_path_included": False,
        "metadata": {"dates_count": len(dates), "amounts_count": len(amounts), "raw_values_redacted": True},
        "processing": {"doc_image_triage": "fallback_cpu_sidecar_rules", "npu_verified": False},
    }
# Suffixes classified without consulting the platform's MIME table, whose
# coverage of e.g. .heic/.webp/.ogg/.opus varies by Python version and OS.
_IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg", ".webp", ".heic", ".heif", ".gif", ".bmp", ".tif", ".tiff"})
_AUDIO_SUFFIXES = frozenset({".mp3", ".m4a", ".wav", ".ogg", ".opus", ".flac"})


def infer_media_type(path: Path) -> str:
    """Classify *path* as ``"pdf"``, ``"image"``, ``"audio"`` or ``"unknown"``.

    Well-known suffixes are matched case-insensitively against fixed sets so
    the answer does not depend on the host's MIME database; anything else
    falls back to ``mimetypes.guess_type``.
    """
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        return "pdf"
    if suffix in _IMAGE_SUFFIXES:
        return "image"
    if suffix in _AUDIO_SUFFIXES:
        return "audio"
    mt, _ = mimetypes.guess_type(path.name)
    if mt and mt.startswith("image/"):
        return "image"
    if mt and mt.startswith("audio/"):
        return "audio"
    return "unknown"
def compact_doc_item(path: Path, root: Path, lane: str, triage_result: dict[str, Any]) -> dict[str, Any]:
    """Reduce a doc/image triage result to a redacted, report-safe item.

    Only the first entry of ``triage_result["pages"]`` is consulted. Missing
    sections fall back to empty mappings, so absent fields become defaults
    (``"unknown_or_low_confidence"``, ``0``, ``"CPU"``, ...). For the
    ``receipts`` lane a ``receipt_fields`` summary is added.
    """
    page_list = triage_result.get("pages") or []
    first_page = page_list[0] if page_list else {}
    classification = first_page.get("classification") or {}
    attention = first_page.get("needs_attention") or {}
    page_meta = first_page.get("metadata") or {}
    devices = triage_result.get("processing_device_summary") or {}

    counts = {
        "dates_count": page_meta.get("dates_count", 0),
        "amounts_count": page_meta.get("amounts_count", 0),
        "raw_values_redacted": True,
    }
    processing = {
        "doc_image_triage": "openvino-doc-image-triage-npu",
        "image_category_device": (classification.get("device") or "CPU"),
        "needs_attention_device": attention.get("device") or "CPU",
        "npu_verified": bool(devices.get("npu_verified")),
        "npu_busy_delta_us": devices.get("npu_busy_delta_us"),
    }
    item = {
        "basename": path.name,
        "relative_path_hash": sha256_text(path.relative_to(root).as_posix()),
        "file_id": triage_result.get("file_id") or sha256_file(path),
        "media_type": triage_result.get("media_type") or infer_media_type(path),
        "category": classification.get("label") or "unknown_or_low_confidence",
        "needs_attention": bool(attention.get("value")),
        "reasons": attention.get("reasons") or [],
        "raw_text_redacted": True,
        "full_path_included": False,
        "metadata": counts,
        "processing": processing,
    }
    if lane == "receipts":
        entities = page_meta.get("detected_entities") or {}
        item["receipt_fields"] = {
            "vendor_present": bool(entities.get("org_present")),
            "amounts_count": counts["amounts_count"],
            "dates_count": counts["dates_count"],
        }
    return item
def classify_transcript(text: str, lane: str) -> dict[str, Any]:
    """Summarise a transcript with keyword rules, returning labels and counts only.

    The ``*_count`` fields count matching pattern families (0 or 1 each; up to
    2 for ``action_items_count``), not individual occurrences in the text.
    """
    matched = {name for name, pattern in ACTION_PATTERNS.items() if pattern.search(text)}
    if lane == "meeting_snippets":
        category = "meeting_snippet"
    else:
        category = "voice_memo"
    return {
        "category": category,
        "action_worthy": bool(matched),
        "reasons": sorted(matched),
        "action_items_count": len(matched & {"follow_up", "task"}),
        "decisions_count": int("decision" in matched),
        "followups_count": int("follow_up" in matched),
    }
def multipart_transcribe(path: Path, whisper_url: str, timeout: float) -> dict[str, Any]:
    """POST *path* to the loopback Whisper service and summarise the response.

    The URL is validated before the file is read, so a non-approved endpoint
    fails closed without touching the audio.

    Args:
        path: audio file to upload.
        whisper_url: endpoint; must pass ``validate_local_whisper_url``.
        timeout: socket timeout in seconds for the HTTP request.

    Returns:
        A dict with ``ok``, ``text``, ``transcript_chars``,
        ``duration_seconds``, ``language``, ``npu_busy_delta_us``,
        ``verified_npu`` and ``wall_ms``. The NPU delta prefers the value
        reported by the service and falls back to the local sysfs counter
        difference.

    Raises:
        FailClosed: if *whisper_url* is not the approved loopback endpoint.
        OSError / urllib.error.URLError / ValueError: on read, transport or
            response-decoding failures (handled by the caller).
    """
    whisper_url = validate_local_whisper_url(whisper_url)
    # errors="replace" keeps undecodable (surrogate-escaped) names from raising.
    name_bytes = path.name.encode("utf-8", errors="replace")
    boundary = "----NpuBatchTriage" + hashlib.sha256(name_bytes).hexdigest()[:12]
    # The file name is untrusted input inside a quoted header parameter: escape
    # the characters that could terminate the quote or inject header lines.
    safe_name = path.name.replace('"', "%22").replace("\r", "%0D").replace("\n", "%0A")
    data = path.read_bytes()
    head = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="file"; filename="{safe_name}"\r\n'
        "Content-Type: application/octet-stream\r\n\r\n"
    ).encode("utf-8", errors="replace")
    tail = (
        f"\r\n--{boundary}\r\n"
        'Content-Disposition: form-data; name="model"\r\n\r\n'
        "whisper-1\r\n"
        f"--{boundary}--\r\n"
    ).encode()
    body = head + data + tail

    before = read_busy()
    req = urllib.request.Request(whisper_url, data=body, headers={"Content-Type": f"multipart/form-data; boundary={boundary}"})
    t0 = time.perf_counter()
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read(1024 * 1024)  # cap the response read at 1 MiB
        status = resp.status
    parsed = json.loads(raw.decode())
    after = read_busy()

    text = str(parsed.get("text") or parsed.get("transcription") or "").strip()
    service_delta = parsed.get("npu_busy_delta_us")
    sysfs_delta = None if before is None or after is None else after - before
    proof_delta = service_delta if isinstance(service_delta, int) else sysfs_delta
    return {
        "ok": status == 200 and bool(text),
        "text": text,
        "transcript_chars": len(text),
        "duration_seconds": parsed.get("duration_seconds"),
        "language": parsed.get("language"),
        "npu_busy_delta_us": proof_delta,
        "verified_npu": bool(proof_delta and proof_delta > 0),
        "wall_ms": round((time.perf_counter() - t0) * 1000, 2),
    }
def compact_audio_item(path: Path, root: Path, lane: str, no_npu: bool, whisper_url: str, timeout: float) -> dict[str, Any]:
    """Build a redacted report item for one audio file.

    With *no_npu* set, no transcription is attempted and the item reports
    ``transcribed: False``. Otherwise the file is sent to the Whisper service.
    Transport, timeout and response-decoding failures are recorded on the item
    as ``error: whisper_error:<ExceptionName>`` rather than raised.

    The transcript is used only to derive labels and counts; it is never placed
    in the item. The per-item verification result, which was previously
    computed and then dropped, is reported as ``npu_verified``.

    Raises:
        FailClosed: propagated unchanged if the Whisper URL is not approved.
    """
    transcript = ""
    transcribed = False
    npu_delta = 0
    proof_ok = False
    duration = None
    language = None
    error = None
    if not no_npu:
        try:
            result = multipart_transcribe(path, whisper_url, timeout)
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # garbled response body takes the same path as a transport error.
        except (urllib.error.URLError, TimeoutError, OSError, ValueError) as exc:
            error = f"whisper_error:{type(exc).__name__}"
        else:
            transcript = result["text"]
            transcribed = result["ok"]
            npu_delta = result.get("npu_busy_delta_us") or 0
            proof_ok = bool(result.get("verified_npu"))
            duration = result.get("duration_seconds")
            language = result.get("language")

    summary = classify_transcript(transcript, lane)
    item = {
        "basename": path.name,
        "relative_path_hash": sha256_text(path.relative_to(root).as_posix()),
        "file_id": sha256_file(path),
        "media_type": "audio",
        "duration_seconds": duration,
        "transcribed": transcribed,
        "transcript_chars": len(transcript),
        "language": language,
        **summary,
        "npu_busy_delta_us": npu_delta,
        "npu_verified": proof_ok,
        "raw_transcript_logged": False,
        "full_path_included": False,
    }
    if error:
        item["error"] = error
    return item
def _item_npu_delta(item: dict[str, Any]) -> int | None:
    """Return an item's integer NPU busy delta, top-level or under ``processing``."""
    delta = item.get("npu_busy_delta_us")
    if delta is None:
        processing = item.get("processing")
        if isinstance(processing, dict):
            delta = processing.get("npu_busy_delta_us")
    if isinstance(delta, int) and not isinstance(delta, bool):
        return delta
    return None


def process(args: argparse.Namespace) -> dict[str, Any]:
    """Run one dry-run triage pass for ``args.lane`` and return the report.

    Steps: load and validate the manifest, resolve the approved/selected roots,
    collect eligible files, build one redacted item per file, then summarise
    NPU evidence from the sysfs counter and per-item deltas.

    Fixes relative to the earlier behaviour:
        * ``max_files`` / ``max_file_mb`` that are non-numeric, negative, NaN
          or infinite now fail closed. A negative ``max_files`` used to slice
          ``files[:-n]`` and process almost everything.
        * Per-item NPU deltas are read from ``item["processing"]`` as well as
          the top level, so document/image items are counted.
        * ``raw_content_redacted`` is always ``True``: the wrapper never emits
          raw text, even when ``--include-raw-text`` is passed.

    Raises:
        FailClosed: for any manifest, approval, limit or Whisper-URL violation.
    """
    repo_root = Path(__file__).resolve().parents[1]
    manifest_path = Path(args.manifest).expanduser().resolve()
    manifest = load_manifest(manifest_path)
    lane_cfg, approved_root, root = resolve_lane_root(manifest, manifest_path, args.lane, args.root)
    exts = allowed_exts(lane_cfg)
    if not exts:
        raise FailClosed(f"extensions_missing:{args.lane}")

    try:
        manifest_limit = int(lane_cfg.get("max_files", 50))
        max_file_mb = float(lane_cfg.get("max_file_mb", 25))
    except (TypeError, ValueError) as exc:
        raise FailClosed(f"limits_invalid:{args.lane}") from exc
    # The chained comparison is False for NaN and for +/-inf as well.
    if manifest_limit < 0 or not (0 <= max_file_mb < float("inf")):
        raise FailClosed(f"limits_invalid:{args.lane}")
    # A request may lower the manifest limit but never raise it.
    limit = min(args.limit if args.limit is not None else manifest_limit, manifest_limit)

    files, skipped, files_seen = iter_files(root, approved_root, exts, max_file_mb, args.max_age_days)
    selected = files[:limit]
    npu_before = read_busy()
    items: list[dict[str, Any]] = []
    errors: list[str] = []
    doc_module = None
    if args.lane in AUDIO_LANES and not args.no_npu:
        # Validate once up front so a bad URL fails the whole run.
        validate_local_whisper_url(args.whisper_url)
    if args.lane in DOC_IMAGE_LANES and not args.no_npu:
        try:
            doc_module = load_doc_triage_module(repo_root)
        except Exception as exc:
            # Import failure is reported; items then use the CPU fallback.
            errors.append(f"doc_triage_import_error:{type(exc).__name__}")

    for path in selected:
        try:
            if args.lane in AUDIO_LANES:
                item = compact_audio_item(path, root, args.lane, args.no_npu, args.whisper_url, args.timeout_seconds)
            elif doc_module is not None:
                # NOTE(review): dry_run=False is passed to the triage module as
                # in the original; its meaning there is not visible from this
                # file. Confirm it does not enable writes.
                opts = doc_module.TriageOptions(
                    dry_run=False,
                    include_ocr_text=False,
                    include_full_path=False,
                    use_embeddings=not args.no_npu,
                    allowed_roots=[approved_root],
                    timeout_seconds=args.timeout_seconds,
                )
                item = compact_doc_item(path, root, args.lane, doc_module.triage_file(path, opts))
            else:
                item = fallback_doc_item(path, root, args.lane)
            if args.include_full_path:
                item["full_path"] = str(path)
                item["full_path_included"] = True
            if args.include_raw_text:
                item["raw_text_included"] = False
                item["raw_text_note"] = "unsupported_by_batch_wrapper"
            items.append(item)
        except FailClosed:
            raise
        except Exception as exc:
            # Per-file failures are recorded by exception type only.
            errors.append(f"{path.name}:{type(exc).__name__}")
            items.append({"basename": path.name, "ok": False, "error": type(exc).__name__, "raw_text_redacted": True, "full_path_included": False})

    npu_after = read_busy()
    sysfs_delta = None if npu_before is None or npu_after is None else npu_after - npu_before
    item_deltas = [d for d in (_item_npu_delta(i) for i in items) if d is not None]
    observed = item_deltas + ([sysfs_delta] if isinstance(sysfs_delta, int) else [])
    claimed = not args.no_npu and any(d > 0 for d in observed)
    proof_ok = claimed and bool(sysfs_delta is None or sysfs_delta > 0 or any(d > 0 for d in item_deltas))
    return {
        "ok": not errors,
        "lane": args.lane,
        "dry_run": True,
        "approved_root": True,
        "root_basename": root.name,
        "files_seen": files_seen,
        "files_processed": len(items),
        "skipped": skipped,
        "npu": {"claimed": claimed, "busy_delta_us": sysfs_delta, "proof_ok": proof_ok},
        "mutations": MUTATIONS_FALSE.copy(),
        "items": items,
        "raw_content_redacted": True,
        "full_paths_included": bool(args.include_full_path),
        "errors": errors,
        "gates": {
            "external_uploads": False,
            "private_root_broadening": False,
            "obsidian_mutation": False,
            "vector_db_mutation": False,
            "outbound_sends": False,
            "routing_changes": False,
        },
    }
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the dry-run triage wrapper."""
    parser = argparse.ArgumentParser(description="Explicit-root dry-run batch triage wrapper")
    add = parser.add_argument

    def flag(name: str, help_text: str) -> None:
        # Boolean switch: absent -> False, present -> True.
        add(name, action="store_true", help=help_text)

    add("--manifest", required=True, help="lane approval manifest; missing/unapproved fails closed")
    add("--lane", required=True, choices=LANES)
    add("--root", help="optional narrower root under the manifest-approved lane root")
    flag("--dry-run", "required; mutation modes are not implemented")
    add("--limit", type=int, default=None)
    add("--max-age-days", type=float, default=None)
    flag("--include-raw-text", "kept redacted by this wrapper; present only for explicit operator attempts")
    flag("--include-full-path", "operator-only local debugging")
    flag("--no-npu", "CPU-only smoke; never claims NPU")
    flag("--json", "emit compact JSON")
    flag("--pretty", "pretty JSON for local debugging")
    add("--whisper-url", default=DEFAULT_WHISPER_URL)
    add("--timeout-seconds", type=float, default=20.0)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point; return 0 when the report is ``ok``, otherwise 2.

    Argument-level rejections (missing ``--dry-run``, non-positive ``--limit``)
    are written as JSON to stderr. The triage report, or a ``fail_closed``
    record, is written as key-sorted JSON to stdout.
    """
    args = build_parser().parse_args(argv)

    rejection = None
    if not args.dry_run:
        rejection = {"ok": False, "error": "dry_run_required", "mutations": MUTATIONS_FALSE}
    elif args.limit is not None and args.limit < 1:
        rejection = {"ok": False, "error": "limit_must_be_positive"}
    if rejection is not None:
        print(json.dumps(rejection), file=sys.stderr)
        return 2

    try:
        out = process(args)
    except FailClosed as exc:
        out = {
            "ok": False,
            "error": "fail_closed",
            "reason": str(exc),
            "dry_run": True,
            "mutations": MUTATIONS_FALSE.copy(),
        }
    indent = 2 if args.pretty else None
    print(json.dumps(out, indent=indent, sort_keys=True))
    if out.get("ok"):
        return 0
    return 2


if __name__ == "__main__":
    raise SystemExit(main())
+202
View File
@@ -0,0 +1,202 @@
from __future__ import annotations
import importlib.util
import json
import os
import subprocess
import sys
from pathlib import Path
import pytest
import yaml
ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / "scripts" / "npu-batch-triage-dry-run.py"
def load_script_module():
    """Import the CLI script from its file path and return the module object."""
    module_spec = importlib.util.spec_from_file_location("npu_batch_triage_dry_run", SCRIPT)
    assert module_spec and module_spec.loader
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
def run_cli(*args: str) -> subprocess.CompletedProcess[str]:
    """Run the CLI script in a subprocess from the repo root, capturing text output."""
    command = [sys.executable, str(SCRIPT)]
    command.extend(args)
    return subprocess.run(
        command,
        cwd=ROOT,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=False,
    )
def manifest(tmp_path: Path, lane: str = "screenshots", approved: bool = True, root: Path | None = None, exts: list[str] | None = None) -> Path:
    """Write a single-lane manifest under *tmp_path* and return its path.

    The lane root (default ``tmp_path / "approved"``) is created if needed.
    """
    root = root or tmp_path / "approved"
    root.mkdir(parents=True, exist_ok=True)

    policy = {
        "default_mode": "dry_run",
        "require_explicit_root": True,
        "allow_external_uploads": False,
        "allow_mutations": False,
        "log_raw_text": False,
    }
    lane_entry = {
        "approved": approved,
        "root": str(root),
        "allowed_extensions": exts or [".png", ".txt", ".wav"],
        "max_files": 10,
        "max_file_mb": 5,
    }
    document = {"version": 1, "policy": policy, "roots": {lane: lane_entry}}

    manifest_path = tmp_path / "triage-roots.yaml"
    manifest_path.write_text(yaml.safe_dump(document))
    return manifest_path
def test_missing_manifest_fails_closed(tmp_path: Path) -> None:
    """A manifest path that does not exist yields a fail_closed report and exit code 2."""
    missing = tmp_path / "missing.yaml"
    proc = run_cli("--manifest", str(missing), "--lane", "screenshots", "--dry-run", "--no-npu", "--json")
    report = json.loads(proc.stdout)

    assert proc.returncode == 2
    assert report["ok"] is False
    assert report["error"] == "fail_closed"
    assert "manifest_missing" in report["reason"]
    assert all(flag is False for flag in report["mutations"].values())
def test_unapproved_lane_fails_closed(tmp_path: Path) -> None:
    """A lane with ``approved: false`` is rejected before any file is scanned."""
    manifest_path = manifest(tmp_path, approved=False)
    proc = run_cli("--manifest", str(manifest_path), "--lane", "screenshots", "--dry-run", "--no-npu", "--json")
    report = json.loads(proc.stdout)

    assert proc.returncode == 2
    assert report["ok"] is False
    assert "lane_unapproved:screenshots" in report["reason"]
def test_request_root_cannot_broaden_manifest_root(tmp_path: Path) -> None:
    """Passing the parent of the approved root as ``--root`` is refused."""
    narrow_root = tmp_path / "approved" / "narrow"
    manifest_path = manifest(tmp_path, root=narrow_root)
    parent_root = tmp_path / "approved"

    proc = run_cli(
        "--manifest", str(manifest_path),
        "--lane", "screenshots",
        "--root", str(parent_root),
        "--dry-run", "--no-npu", "--json",
    )
    report = json.loads(proc.stdout)

    assert proc.returncode == 2
    assert report["ok"] is False
    assert "request_root_broadens_approval:screenshots" in report["reason"]
def test_symlink_escape_is_skipped_and_output_redacted(tmp_path: Path) -> None:
    """A symlink pointing outside the root is skipped; sidecar text never leaks into output."""
    inside = tmp_path / "approved"
    inside.mkdir()
    elsewhere = tmp_path / "outside"
    elsewhere.mkdir()
    (inside / "note.png").write_bytes(b"fake image")
    (inside / "note.png.txt").write_text("Invoice payment due 2026-06-10 $42.00")
    (elsewhere / "secret.png").write_bytes(b"secret")
    os.symlink(elsewhere / "secret.png", inside / "escape.png")
    manifest_path = manifest(tmp_path, root=inside, exts=[".png"])

    proc = run_cli("--manifest", str(manifest_path), "--lane", "screenshots", "--dry-run", "--no-npu", "--json")
    report = json.loads(proc.stdout)

    assert proc.returncode == 0
    assert report["ok"] is True
    assert report["files_processed"] == 1
    assert report["skipped"]["symlink_escape"] == 1

    only_item = report["items"][0]
    assert only_item["basename"] == "note.png"
    assert only_item["raw_text_redacted"] is True
    assert only_item["full_path_included"] is False
    assert "full_path" not in only_item

    assert "Invoice" not in json.dumps(report)
    assert report["npu"]["claimed"] is False
    assert all(flag is False for flag in report["mutations"].values())
def test_committed_sample_manifest_cpu_smoke() -> None:
    """CPU-only smoke run over the committed sample manifest and sample files."""
    cli_args = [
        "--manifest", "config/triage-roots.test.yaml",
        "--lane", "receipts",
        "--root", "openvino-doc-image-triage-npu/samples",
        "--limit", "2",
        "--dry-run",
        "--no-npu",
        "--json",
    ]
    proc = run_cli(*cli_args)
    report = json.loads(proc.stdout)

    assert proc.returncode == 0
    assert report["ok"] is True
    assert report["lane"] == "receipts"
    assert report["dry_run"] is True
    assert report["files_processed"] == 2
    # The exact-dict alternative in the earlier assertion implied this check,
    # so the single condition is equivalent.
    assert report["npu"]["claimed"] is False
    assert all(flag is False for flag in report["mutations"].values())
    assert all(entry.get("raw_text_redacted", True) for entry in report["items"])
def test_audio_lane_no_npu_does_not_transcribe_or_claim(tmp_path: Path) -> None:
    """With ``--no-npu`` an audio lane reports the file without transcribing or claiming NPU."""
    voice_root = tmp_path / "voice"
    voice_root.mkdir()
    (voice_root / "memo.wav").write_bytes(b"not really wav; no-npu mode must not decode")
    manifest_path = manifest(tmp_path, lane="voice_memos", root=voice_root, exts=[".wav"])

    proc = run_cli("--manifest", str(manifest_path), "--lane", "voice_memos", "--dry-run", "--no-npu", "--json")
    report = json.loads(proc.stdout)

    assert proc.returncode == 0
    memo = report["items"][0]
    assert memo["transcribed"] is False
    assert memo["raw_transcript_logged"] is False
    assert report["npu"]["claimed"] is False
    assert all(flag is False for flag in report["mutations"].values())
def test_external_whisper_url_fails_closed_before_audio_read(tmp_path: Path) -> None:
    """URL validation happens first: a non-loopback host fails before the missing file is opened."""
    script = load_script_module()
    absent_audio = tmp_path / "missing.wav"
    external_url = "http://example.com:18816/v1/audio/transcriptions"
    with pytest.raises(script.FailClosed, match="whisper_url_not_loopback"):
        script.multipart_transcribe(absent_audio, external_url, 0.01)
def test_audio_lane_rejects_external_whisper_url(tmp_path: Path) -> None:
    """An https/external ``--whisper-url`` makes the whole run fail closed."""
    voice_root = tmp_path / "voice"
    voice_root.mkdir()
    (voice_root / "memo.wav").write_bytes(b"synthetic audio bytes")
    manifest_path = manifest(tmp_path, lane="voice_memos", root=voice_root, exts=[".wav"])

    cli_args = [
        "--manifest", str(manifest_path),
        "--lane", "voice_memos",
        "--dry-run",
        "--whisper-url", "https://example.com/v1/audio/transcriptions",
        "--json",
    ]
    proc = run_cli(*cli_args)
    report = json.loads(proc.stdout)

    assert proc.returncode == 2
    assert report["ok"] is False
    assert report["error"] == "fail_closed"
    assert report["reason"] == "whisper_url_scheme_not_http"
    expected_mutations = {
        "obsidian": False,
        "rag": False,
        "vector_db": False,
        "sends": False,
        "file_moves": False,
        "routing": False,
        "memory": False,
        "service_restarts": False,
    }
    assert report["mutations"] == expected_mutations
def test_localhost_whisper_url_is_allowed() -> None:
    """``localhost``, IPv4 loopback and IPv6 loopback on port 18816 all validate."""
    script = load_script_module()
    loopback_urls = (
        "http://localhost:18816/v1/audio/transcriptions",
        "http://127.0.0.1:18816/v1/audio/transcriptions",
        "http://[::1]:18816/v1/audio/transcriptions",
    )
    for url in loopback_urls:
        assert script.validate_local_whisper_url(url)
View File