[verified] refresh OpenVINO NPU reranker prototype
This commit is contained in:
@@ -13,8 +13,9 @@ This service is intentionally not wired into live RAG by default.
|
||||
## Files
|
||||
|
||||
- `SPEC.md` — endpoint/CLI contract, model/runtime recommendation, smoke/NPU proof plan, RAG integration plan, docs implications, and no-go criteria.
|
||||
- `server.py` — stdlib HTTP OpenVINO Runtime service.
|
||||
- `server.py` — stdlib HTTP OpenVINO Runtime service with fail-fast localhost listener conflict checks and request validation.
|
||||
- `smoke.py` — non-private API/ranking/NPU busy-time smoke test.
|
||||
- `tests/test_server_validation.py` — stdlib unit checks for request validation and listener conflict detection.
|
||||
- `openvino-reranker.service` — optional user-systemd unit.
|
||||
|
||||
## One-time setup
|
||||
@@ -62,7 +63,7 @@ OPENVINO_RERANKER_MODEL_DIR=/home/will/.cache/openvino-models/rerankers/ms-marco
|
||||
python /home/will/lab/swarm/openvino-reranker-npu/server.py
|
||||
```
|
||||
|
||||
Startup performs a non-private smoke inference and fails closed when `OPENVINO_RERANKER_DEVICE=NPU` but `npu_busy_time_us` does not increase.
|
||||
Startup performs a non-private smoke inference and fails closed when `OPENVINO_RERANKER_DEVICE=NPU` but `npu_busy_time_us` does not increase. It also checks whether the requested listener can bind before compiling the OpenVINO model, so obvious port conflicts fail fast; the real server bind still happens immediately after model load.
|
||||
|
||||
## API
|
||||
|
||||
@@ -110,6 +111,16 @@ Expected:
|
||||
- The top result matches the non-private fixture expectation.
|
||||
- Response and sysfs `npu_busy_delta_us` are positive.
|
||||
|
||||
## Validation checks
|
||||
|
||||
```bash
|
||||
source /home/will/.venvs/openvino-reranker/bin/activate
|
||||
PYTHONPATH=/home/will/lab/swarm/openvino-reranker-npu \
|
||||
python -m unittest discover -s /home/will/lab/swarm/openvino-reranker-npu/tests
|
||||
```
|
||||
|
||||
These checks do not compile the OpenVINO model; they cover request validation and fail-fast listener conflict detection.
|
||||
|
||||
## Optional systemd user service
|
||||
|
||||
Install the unit only after the foreground command and smoke test pass:
|
||||
|
||||
@@ -105,7 +105,7 @@ Error response shape:
|
||||
|
||||
Status behavior:
|
||||
|
||||
- 400: invalid JSON schema, empty query, missing/empty documents, invalid document text.
|
||||
- 400: invalid JSON schema, empty query, missing/empty documents, invalid document text, or non-positive/non-integer `top_k`/`top_n`.
|
||||
- 413: request body above `OPENVINO_RERANKER_MAX_BODY_BYTES`.
|
||||
- 503: model not ready.
|
||||
- 500: unexpected inference/runtime failure.
|
||||
|
||||
@@ -16,6 +16,7 @@ import argparse
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import socket
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
@@ -251,6 +252,27 @@ def normalize_documents(value: Any, max_documents: int) -> list[dict[str, Any]]:
|
||||
return docs
|
||||
|
||||
|
||||
def parse_top_k(value: Any, document_count: int) -> int:
|
||||
"""Validate top_k/top_n before inference so schema errors return HTTP 400."""
|
||||
if value is None:
|
||||
return document_count
|
||||
if isinstance(value, bool) or not isinstance(value, int):
|
||||
raise ValueError("top_k/top_n must be a positive integer")
|
||||
if value < 1:
|
||||
raise ValueError("top_k/top_n must be a positive integer")
|
||||
return min(value, document_count)
|
||||
|
||||
|
||||
def assert_port_available(host: str, port: int) -> None:
|
||||
"""Fail fast on listener conflicts before compiling the OpenVINO model."""
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
||||
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
try:
|
||||
sock.bind((host, port))
|
||||
except OSError as exc:
|
||||
raise RuntimeError(f"cannot bind {host}:{port}; listener conflict or invalid bind: {exc}") from exc
|
||||
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
|
||||
server_version = "OpenVINOReranker/0.1"
|
||||
|
||||
@@ -293,6 +315,7 @@ class Handler(BaseHTTPRequestHandler):
|
||||
raise ValueError("query is required")
|
||||
top_k = payload.get("top_k", payload.get("top_n"))
|
||||
documents = normalize_documents(payload.get("documents"), self.max_documents)
|
||||
top_k = parse_top_k(top_k, len(documents))
|
||||
return_documents = bool(payload.get("return_documents", True))
|
||||
response = self.svc.rerank(query.strip(), documents, top_k=top_k, return_documents=return_documents)
|
||||
self.write_json(response)
|
||||
@@ -342,6 +365,7 @@ def main() -> int:
|
||||
parser.add_argument("--skip-startup-smoke", action="store_true", default=os.environ.get("OPENVINO_RERANKER_SKIP_STARTUP_SMOKE", "").lower() in {"1", "true", "yes"})
|
||||
args = parser.parse_args()
|
||||
|
||||
assert_port_available(args.host, args.port)
|
||||
service = RerankerService(
|
||||
Path(args.model_dir).expanduser(),
|
||||
args.model,
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit checks for reranker request validation helpers.
|
||||
|
||||
These tests intentionally avoid loading an OpenVINO model; they only cover the
|
||||
stdlib validation helpers used before inference.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import socket
|
||||
import unittest
|
||||
|
||||
from server import assert_port_available, normalize_documents, parse_top_k
|
||||
|
||||
|
||||
class ValidationTests(unittest.TestCase):
|
||||
def test_normalize_accepts_strings_and_objects(self) -> None:
|
||||
docs = normalize_documents(
|
||||
[
|
||||
"plain text document",
|
||||
{"id": "obj", "text": "object document", "metadata": {"source": "synthetic"}},
|
||||
],
|
||||
max_documents=2,
|
||||
)
|
||||
self.assertEqual(docs[0], {"text": "plain text document"})
|
||||
self.assertEqual(docs[1]["id"], "obj")
|
||||
self.assertEqual(docs[1]["metadata"], {"source": "synthetic"})
|
||||
|
||||
def test_normalize_rejects_empty_or_too_many_documents(self) -> None:
|
||||
with self.assertRaisesRegex(ValueError, "non-empty"):
|
||||
normalize_documents([], max_documents=2)
|
||||
with self.assertRaisesRegex(ValueError, "max_documents"):
|
||||
normalize_documents(["a", "b", "c"], max_documents=2)
|
||||
with self.assertRaisesRegex(ValueError, "non-empty string"):
|
||||
normalize_documents([{"id": "empty", "text": ""}], max_documents=2)
|
||||
|
||||
def test_parse_top_k_defaults_clamps_and_rejects_invalid_values(self) -> None:
|
||||
self.assertEqual(parse_top_k(None, document_count=3), 3)
|
||||
self.assertEqual(parse_top_k(2, document_count=3), 2)
|
||||
self.assertEqual(parse_top_k(99, document_count=3), 3)
|
||||
for value in (0, -1, True, False, 1.5, "2", "nope"):
|
||||
with self.subTest(value=value):
|
||||
with self.assertRaisesRegex(ValueError, "positive integer"):
|
||||
parse_top_k(value, document_count=3)
|
||||
|
||||
def test_assert_port_available_detects_listener_conflict(self) -> None:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as listener:
|
||||
listener.bind(("127.0.0.1", 0))
|
||||
listener.listen(1)
|
||||
port = listener.getsockname()[1]
|
||||
with self.assertRaisesRegex(RuntimeError, "cannot bind"):
|
||||
assert_port_available("127.0.0.1", port)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user