Files
swarm-master/openvino-genai-npu-worker/smoke_llm_npu.py
2026-06-04 13:07:51 -07:00

86 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""Smoke-test OpenVINO GenAI LLMPipeline on Intel NPU.
This verifies NPU execution by reading /sys/class/accel/accel0/device/npu_busy_time_us
before the pipeline is loaded and again after generation; the counter must increase.
HTTP 200/service success is not considered proof.
"""
from __future__ import annotations
import argparse
import json
import time
from pathlib import Path
from typing import Any
# Default model path, used when --model is not given on the command line.
DEFAULT_MODEL = "/home/will/models/openvino-genai/Qwen2.5-1.5B-Instruct-int4-ov"
# Default cache directory, used when --cache-dir is not given; it is created
# if missing and handed to the pipeline as its CACHE_DIR setting.
DEFAULT_CACHE = "/home/will/.cache/openvino/genai-npu/qwen2.5-1.5b-int4"
# Default file holding the NPU busy-time counter; overridable with --busy-path.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
def import_openvino_genai() -> Any:
    """Import ``openvino_genai`` on demand and return the module object.

    The import lives inside the function, so the package is only loaded when
    this function is actually called rather than when this script is imported.

    Raises:
        ImportError: If ``openvino_genai`` cannot be imported.
    """
    import openvino_genai  # type: ignore[import-not-found]

    return openvino_genai
def read_busy(path: Path = BUSY_PATH) -> int:
    """Return the integer stored in the busy-time counter file at *path*.

    Surrounding whitespace (such as a trailing newline) is removed before the
    text is converted.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the stripped contents are not a valid integer.
    """
    raw_value = path.read_text()
    return int(raw_value.strip())
def main(argv: list[str] | None = None) -> int:
    """Generate once on the NPU and check that the busy-time counter advanced.

    The busy-time counter is sampled before the pipeline is constructed and
    again after generation finishes. A JSON report with the generated text,
    timings and both counter readings is printed to stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, so a plain ``main()`` call behaves
            exactly as a normal script invocation.

    Returns:
        0 when the counter read after generation is larger than the one read
        before the pipeline was loaded, 2 otherwise.

    Raises:
        SystemExit: With an explanatory message when the model path or the
            busy-time counter file does not exist, or when
            ``--max-new-tokens`` is outside the range 1..256.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--cache-dir", default=DEFAULT_CACHE)
    parser.add_argument("--busy-path", default=str(BUSY_PATH))
    parser.add_argument(
        "--prompt",
        default="Write a concise title for: Synthetic NPU worker contract smoke with no routing changes.",
    )
    parser.add_argument("--max-new-tokens", type=int, default=24)
    args = parser.parse_args(argv)

    model_path = Path(args.model)
    cache_dir = Path(args.cache_dir)
    busy_path = Path(args.busy_path)

    # Validate every input before causing any side effect, so that a rejected
    # invocation does not leave a freshly created cache directory behind.
    if not model_path.exists():
        raise SystemExit(f"model path does not exist: {model_path}")
    if not busy_path.exists():
        raise SystemExit(f"NPU busy-time counter does not exist: {busy_path}")
    if not 1 <= args.max_new_tokens <= 256:
        raise SystemExit("max-new-tokens must be between 1 and 256")

    cache_dir.mkdir(parents=True, exist_ok=True)

    # Properties forwarded to LLMPipeline as keyword arguments.
    # NOTE(review): exact semantics of these keys are defined by OpenVINO
    # GenAI's NPU support -- confirm against its documentation before tuning.
    config = {
        "CACHE_DIR": str(cache_dir),
        "MAX_PROMPT_LEN": 1024,
        "MIN_RESPONSE_LEN": 64,
        "PREFILL_HINT": "DYNAMIC",
        "GENERATE_HINT": "FAST_COMPILE",
    }

    ov_genai = import_openvino_genai()

    # First counter sample is taken before the pipeline is built, so the
    # reported delta spans both pipeline load and generation.
    before = read_busy(busy_path)

    load_start = time.monotonic()
    pipe = ov_genai.LLMPipeline(str(model_path), "NPU", **config)
    load_ms = round((time.monotonic() - load_start) * 1000, 2)

    gen_start = time.monotonic()
    output = pipe.generate(args.prompt, max_new_tokens=args.max_new_tokens)
    gen_ms = round((time.monotonic() - gen_start) * 1000, 2)

    after = read_busy(busy_path)

    result = {
        "model": str(model_path),
        "device": "NPU",
        "cache_dir": str(cache_dir),
        "prompt_chars": len(args.prompt),
        "max_new_tokens": args.max_new_tokens,
        "text": str(output).strip(),
        "timing_ms": {"load": load_ms, "generate": gen_ms, "total": round(load_ms + gen_ms, 2)},
        "npu_busy_before_us": before,
        "npu_busy_after_us": after,
        "npu_busy_delta_us": after - before,
    }
    print(json.dumps(result, indent=2))

    # A completed generation alone is not accepted as success: the counter
    # must have moved, otherwise exit status 2 is reported.
    return 0 if after > before else 2
if __name__ == "__main__":
    # Run the smoke test and hand its return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)