#!/usr/bin/env python3
"""Smoke-test OpenVINO GenAI LLMPipeline on Intel NPU.

This verifies NPU execution by reading
/sys/class/accel/accel0/device/npu_busy_time_us before and after generation.
HTTP 200/service success is not considered proof.
"""

from __future__ import annotations

import argparse
import json
import time
from pathlib import Path
from typing import Any

DEFAULT_MODEL = "/home/will/models/openvino-genai/Qwen2.5-1.5B-Instruct-int4-ov"
DEFAULT_CACHE = "/home/will/.cache/openvino/genai-npu/qwen2.5-1.5b-int4"
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")


def import_openvino_genai() -> Any:
    """Import and return the ``openvino_genai`` module.

    The import happens at call time rather than at module import time;
    ``main`` only calls this after its argument validation has passed.

    Raises:
        ImportError: If ``openvino_genai`` is not installed.
    """
    import openvino_genai as ov_genai  # type: ignore[import-not-found]

    return ov_genai


def read_busy(path: Path = BUSY_PATH) -> int:
    """Return the integer stored in the busy-time counter file at *path*.

    The value is presumably in microseconds, going by the ``_us`` suffix of
    the default counter name; this function only parses it as an integer.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the stripped file content is not an integer.
    """
    return int(path.read_text().strip())


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the smoke test."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--cache-dir", default=DEFAULT_CACHE)
    parser.add_argument("--busy-path", default=str(BUSY_PATH))
    parser.add_argument(
        "--prompt",
        default="Write a concise title for: Synthetic NPU worker contract smoke with no routing changes.",
    )
    parser.add_argument("--max-new-tokens", type=int, default=24)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run one generation on the NPU and report whether the NPU was busy.

    Prints a JSON report (generated text, timings, counter readings) to
    stdout.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 if the busy-time counter increased across load and generation,
        2 otherwise.

    Raises:
        SystemExit: If the model path or the counter file does not exist, or
            ``--max-new-tokens`` is outside 1..256.
    """
    args = _build_parser().parse_args(argv)
    model_path = Path(args.model)
    cache_dir = Path(args.cache_dir)
    busy_path = Path(args.busy_path)

    if not model_path.exists():
        raise SystemExit(f"model path does not exist: {model_path}")
    if not busy_path.exists():
        raise SystemExit(f"NPU busy-time counter does not exist: {busy_path}")
    if not 1 <= args.max_new_tokens <= 256:
        raise SystemExit("max-new-tokens must be between 1 and 256")

    # Create the cache directory only after validation has passed, so that a
    # rejected invocation leaves no directories behind.
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Forwarded verbatim to LLMPipeline as keyword properties.
    config = {
        "CACHE_DIR": str(cache_dir),
        "MAX_PROMPT_LEN": 1024,
        "MIN_RESPONSE_LEN": 64,
        "PREFILL_HINT": "DYNAMIC",
        "GENERATE_HINT": "FAST_COMPILE",
    }

    ov_genai = import_openvino_genai()

    # The first counter reading is taken before the pipeline is built, so the
    # reported delta covers both pipeline construction and generation.
    before = read_busy(busy_path)

    load_start = time.monotonic()
    pipe = ov_genai.LLMPipeline(str(model_path), "NPU", **config)
    load_ms = round((time.monotonic() - load_start) * 1000, 2)

    gen_start = time.monotonic()
    output = pipe.generate(args.prompt, max_new_tokens=args.max_new_tokens)
    gen_ms = round((time.monotonic() - gen_start) * 1000, 2)

    after = read_busy(busy_path)

    result = {
        "model": str(model_path),
        "device": "NPU",
        "cache_dir": str(cache_dir),
        "prompt_chars": len(args.prompt),
        "max_new_tokens": args.max_new_tokens,
        "text": str(output).strip(),
        "timing_ms": {
            "load": load_ms,
            "generate": gen_ms,
            "total": round(load_ms + gen_ms, 2),
        },
        "npu_busy_before_us": before,
        "npu_busy_after_us": after,
        "npu_busy_delta_us": after - before,
    }
    print(json.dumps(result, indent=2))

    # A counter that did not advance means the run is not accepted as proof
    # of NPU execution, even though generation returned.
    return 0 if after > before else 2


if __name__ == "__main__":
    raise SystemExit(main())