#!/usr/bin/env python3
"""Smoke-test OpenVINO GenAI LLMPipeline on Intel NPU.

This verifies NPU execution by reading /sys/class/accel/accel0/device/npu_busy_time_us
before and after generation. HTTP 200/service success is not considered proof.
"""

from __future__ import annotations

import argparse
import json
import sys
import time
from pathlib import Path
from typing import Any

# Default value of the --model option: path of the model handed to LLMPipeline.
DEFAULT_MODEL = "/home/will/models/openvino-genai/Qwen2.5-1.5B-Instruct-int4-ov"
# Default value of the --cache-dir option; passed to the pipeline as CACHE_DIR.
DEFAULT_CACHE = "/home/will/.cache/openvino/genai-npu/qwen2.5-1.5b-int4"
# NPU busy-time counter file that is read before and after the run.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")


def import_openvino_genai() -> Any:
    """Import the ``openvino_genai`` package and return the module object.

    The import happens at call time rather than at module import time, so
    this file can be loaded on machines where the package is not installed.

    Returns:
        The imported ``openvino_genai`` module.

    Raises:
        ImportError: if ``openvino_genai`` cannot be imported.
    """
    import openvino_genai as ov_genai  # type: ignore[import-not-found]

    return ov_genai


def read_busy(path: Path = BUSY_PATH) -> int:
    """Return the integer stored in the busy-time counter file at *path*.

    Args:
        path: File to read. Defaults to ``BUSY_PATH``.

    Returns:
        The file contents, with surrounding whitespace removed, parsed as a
        base-10 integer.

    Raises:
        OSError: if the file cannot be read.
        ValueError: if the contents are not an integer; the message names
            the file and shows the raw text that was read.
    """
    raw = path.read_text().strip()
    try:
        return int(raw)
    except ValueError as err:
        # The bare int() error does not say which file was malformed.
        raise ValueError(
            f"unexpected busy-time counter contents in {path}: {raw!r}"
        ) from err


def main(argv: list[str] | None = None) -> int:
    """Run one generation on the NPU and check that the busy counter advanced.

    The busy-time counter is read once before the pipeline is constructed and
    once after generation returns, so the reported delta covers both model
    load and generation. A JSON report is printed on stdout.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        ``0`` if the counter value after generation is greater than the value
        read before loading, otherwise ``2`` (with a diagnostic on stderr).
        Note that argparse itself also exits with status 2 on usage errors.

    Raises:
        SystemExit: if the model path or the counter file does not exist, or
            if ``--max-new-tokens`` is outside the range 1..256.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--cache-dir", default=DEFAULT_CACHE)
    parser.add_argument("--busy-path", default=str(BUSY_PATH))
    parser.add_argument(
        "--prompt",
        default="Write a concise title for: Synthetic NPU worker contract smoke with no routing changes.",
    )
    parser.add_argument("--max-new-tokens", type=int, default=24)
    args = parser.parse_args(argv)

    model_path = Path(args.model)
    cache_dir = Path(args.cache_dir)
    busy_path = Path(args.busy_path)

    # Validate all inputs before creating anything on disk, so a rejected
    # invocation leaves no cache directory behind.
    if not model_path.exists():
        raise SystemExit(f"model path does not exist: {model_path}")
    if not busy_path.exists():
        raise SystemExit(f"NPU busy-time counter does not exist: {busy_path}")
    if not 1 <= args.max_new_tokens <= 256:
        raise SystemExit("max-new-tokens must be between 1 and 256")
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Properties forwarded to LLMPipeline as keyword arguments.
    config = {
        "CACHE_DIR": str(cache_dir),
        "MAX_PROMPT_LEN": 1024,
        "MIN_RESPONSE_LEN": 64,
        "PREFILL_HINT": "DYNAMIC",
        "GENERATE_HINT": "FAST_COMPILE",
    }

    ov_genai = import_openvino_genai()

    # Baseline is taken before the pipeline exists, so the delta below spans
    # load as well as generation.
    before = read_busy(busy_path)

    load_start = time.monotonic()
    pipe = ov_genai.LLMPipeline(str(model_path), "NPU", **config)
    load_ms = round((time.monotonic() - load_start) * 1000, 2)

    gen_start = time.monotonic()
    output = pipe.generate(args.prompt, max_new_tokens=args.max_new_tokens)
    gen_ms = round((time.monotonic() - gen_start) * 1000, 2)

    after = read_busy(busy_path)

    result = {
        "model": str(model_path),
        "device": "NPU",
        "cache_dir": str(cache_dir),
        "prompt_chars": len(args.prompt),
        "max_new_tokens": args.max_new_tokens,
        "text": str(output).strip(),
        "timing_ms": {
            "load": load_ms,
            "generate": gen_ms,
            "total": round(load_ms + gen_ms, 2),
        },
        "npu_busy_before_us": before,
        "npu_busy_after_us": after,
        "npu_busy_delta_us": after - before,
    }
    print(json.dumps(result, indent=2))

    if after > before:
        return 0

    # Keep stdout pure JSON; explain the non-zero status on stderr instead.
    print(
        f"NPU busy-time counter did not advance (before={before}, after={after})",
        file=sys.stderr,
    )
    return 2


# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())