Files
2026-06-04 13:07:51 -07:00

168 lines
6.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""Smoke/benchmark checks for the OpenVINO reranker service.
Prints a JSON summary and exits non-zero on schema/ranking/NPU verification failure.
Uses only non-private fixture text.
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any
# Sysfs file sampled before and after each rerank request; the difference
# between the two readings is reported as the NPU busy-time delta.
NPU_BUSY_FILE = Path("/sys/class/accel/accel0/device/npu_busy_time_us")

# Non-private smoke-test cases. Every case carries one query, its candidate
# documents, and the id of the document expected to be ranked first.
FIXTURES = [
    {
        "query": "how do I verify OpenVINO NPU usage?",
        "documents": [
            {
                "id": "good",
                "text": "Check /sys/class/accel/accel0/device/npu_busy_time_us before and after inference.",
            },
            {
                "id": "bad",
                "text": "This note is about making sourdough starter.",
            },
        ],
        "expected_top_id": "good",
    },
    {
        "query": "what port does the reranker service use?",
        "documents": [
            {
                "id": "unrelated",
                "text": "Whisper transcription accepts audio uploads.",
            },
            {
                "id": "port",
                "text": "The OpenVINO reranker prototype listens locally on port 18818.",
            },
        ],
        "expected_top_id": "port",
    },
    {
        "query": "why should reranking not mutate vector collections?",
        "documents": [
            {
                "id": "mutation",
                "text": "Reranking is a read-only second-stage transformation after vector search.",
            },
            {
                "id": "cooking",
                "text": "Boil pasta in salted water until al dente.",
            },
        ],
        "expected_top_id": "mutation",
    },
]
def npu_busy_time_us() -> int | None:
try:
return int(NPU_BUSY_FILE.read_text().strip())
except Exception:
return None
def post_json(url: str, payload: dict[str, Any], timeout: float) -> tuple[int, dict[str, Any]]:
data = json.dumps(payload).encode("utf-8")
req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}, method="POST")
try:
with urllib.request.urlopen(req, timeout=timeout) as resp:
body = resp.read().decode("utf-8", "replace")
return resp.status, json.loads(body)
except urllib.error.HTTPError as exc:
body = exc.read().decode("utf-8", "replace")
try:
parsed = json.loads(body)
except Exception:
parsed = {"error": body}
return exc.code, parsed
def get_json(url: str, timeout: float) -> tuple[int, dict[str, Any]]:
try:
with urllib.request.urlopen(url, timeout=timeout) as resp:
body = resp.read().decode("utf-8", "replace")
return resp.status, json.loads(body)
except urllib.error.HTTPError as exc:
body = exc.read().decode("utf-8", "replace")
try:
parsed = json.loads(body)
except Exception:
parsed = {"error": body}
return exc.code, parsed
def percentile(values: list[float], pct: float) -> float | None:
if not values:
return None
ordered = sorted(values)
idx = min(len(ordered) - 1, max(0, round((pct / 100.0) * (len(ordered) - 1))))
return round(ordered[idx], 3)
def _safe_call(call: Any, *call_args: Any) -> tuple[int, dict[str, Any]]:
    """Invoke an HTTP helper and always return ``(status, dict)``.

    ``OSError`` (which includes ``urllib.error.URLError`` and timeouts) and
    ``ValueError`` (which includes JSON decoding errors) are converted into
    status ``0`` with an ``"error"`` entry, so the run can record a failure
    and still print its summary. A payload that is not a dict is replaced by
    an error dict so later ``.get()`` calls are safe.
    """
    try:
        status, payload = call(*call_args)
    except (OSError, ValueError) as exc:
        return 0, {"error": f"{type(exc).__name__}: {exc}"}
    if not isinstance(payload, dict):
        return status, {"error": f"expected a JSON object, got {type(payload).__name__}"}
    return status, payload


def _latency_ms(payload: dict[str, Any], wall_ms: float) -> float:
    """Return the service-reported ``duration_ms``, or *wall_ms* when absent or unusable."""
    try:
        return float(payload.get("duration_ms") or wall_ms)
    except (TypeError, ValueError):
        return wall_ms


def main(argv: list[str] | None = None) -> int:
    """Run the readiness check and every fixture, then print a JSON summary.

    Args:
        argv: Command-line arguments; ``None`` makes argparse use ``sys.argv[1:]``.

    Returns:
        ``0`` when no failure was recorded, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", default="http://127.0.0.1:18818")
    parser.add_argument("--timeout", type=float, default=20.0)
    parser.add_argument("--allow-cpu", action="store_true", help="do not fail when health reports a non-NPU device")
    args = parser.parse_args(argv)
    base = args.url.rstrip("/")
    failures: list[str] = []

    health_status, health = _safe_call(get_json, f"{base}/readyz", args.timeout)
    if health_status != 200 or not health.get("ok"):
        failures.append(f"readyz failed status={health_status} error={health.get('ready_error') or health.get('error')}")
    device = health.get("device")
    if device != "NPU" and not args.allow_cpu:
        failures.append(f"device is {device!r}, expected 'NPU'")

    latencies: list[float] = []
    response_npu_total = 0
    sysfs_npu_total = 0
    top1_passed = 0
    for case in FIXTURES:
        label = case["expected_top_id"]
        documents = case["documents"]
        request_body = {
            "query": case["query"],
            "documents": documents,
            "top_k": len(documents),
            "return_documents": False,
        }

        # Sample the sysfs counter around the request to get an independent delta.
        before = npu_busy_time_us()
        started = time.perf_counter()
        status, payload = _safe_call(post_json, f"{base}/rerank", request_body, args.timeout)
        wall_ms = (time.perf_counter() - started) * 1000
        after = npu_busy_time_us()

        latencies.append(_latency_ms(payload, wall_ms))
        response_delta = payload.get("npu_busy_delta_us")
        sysfs_delta = None if before is None or after is None else after - before
        if isinstance(response_delta, int):
            response_npu_total += response_delta
        if isinstance(sysfs_delta, int):
            sysfs_npu_total += sysfs_delta

        results = payload.get("results")
        first = results[0] if isinstance(results, list) and results else None
        # A malformed first entry counts as "no top id" instead of crashing.
        top_id = first.get("id") if isinstance(first, dict) else None

        if status != 200 or not payload.get("ok"):
            failures.append(f"case {label} HTTP/status failed: status={status} error={payload.get('error')}")
        if not isinstance(results, list) or len(results) != len(documents):
            failures.append(f"case {label} returned invalid results")
        if top_id == label:
            top1_passed += 1
        else:
            failures.append(f"case {label} top_id={top_id!r}")
        if device == "NPU":
            # On NPU both the service-reported and the sysfs delta must show activity.
            if not isinstance(response_delta, int) or response_delta <= 0:
                failures.append(f"case {label} response npu delta not positive: {response_delta}")
            if not isinstance(sysfs_delta, int) or sysfs_delta <= 0:
                failures.append(f"case {label} sysfs npu delta not positive: {sysfs_delta}")

    summary = {
        "ok": not failures,
        "url": base,
        "model": health.get("model"),
        "device": device,
        "cases": len(FIXTURES),
        "top1_passed": top1_passed,
        "p50_ms": percentile(latencies, 50),
        "p95_ms": percentile(latencies, 95),
        "mean_ms": round(statistics.mean(latencies), 3) if latencies else None,
        "npu_busy_delta_us_total": sysfs_npu_total,
        "response_npu_busy_delta_us_total": response_npu_total,
        "failures": failures,
    }
    print(json.dumps(summary, indent=2, sort_keys=True))
    return 0 if not failures else 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())