swarm-master/openvino-genai-npu-worker/smoke_llm_npu.py

#!/usr/bin/env python3
"""Smoke-test OpenVINO GenAI LLMPipeline on Intel NPU.

This verifies NPU execution by reading /sys/class/accel/accel0/device/npu_busy_time_us
before and after generation. HTTP 200/service success is not considered proof.
"""
from __future__ import annotations

import argparse
import json
import time
from pathlib import Path
from typing import Any

# Default value of the --model option: the model path handed to LLMPipeline.
DEFAULT_MODEL = "/home/will/models/openvino-genai/Qwen2.5-1.5B-Instruct-int4-ov"
# Default value of the --cache-dir option; forwarded to the pipeline as CACHE_DIR.
DEFAULT_CACHE = "/home/will/.cache/openvino/genai-npu/qwen2.5-1.5b-int4"
# Default counter file for read_busy() and the --busy-path option.
# NOTE(review): presumably cumulative NPU busy time in microseconds, going by
# the file name -- confirm against the driver documentation.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")


def import_openvino_genai() -> Any:
    """Import the ``openvino_genai`` package and return the module object.

    The import statement lives inside the function, so it only runs when the
    function is called rather than when this file is loaded.
    """
    import openvino_genai  # type: ignore[import-not-found]

    return openvino_genai


def read_busy(path: Path = BUSY_PATH) -> int:
    """Return the integer stored in the counter file at *path*.

    The whole file is read as text, surrounding whitespace is removed, and the
    remainder is parsed with ``int``.
    """
    raw_value = path.read_text()
    return int(raw_value.strip())


def main(argv: list[str] | None = None) -> int:
    """Run the NPU smoke test and print the result as JSON on stdout.

    Builds an ``LLMPipeline`` on the ``"NPU"`` device, generates one completion
    for the prompt, and samples the busy-time counter before the pipeline is
    loaded and after generation finishes.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        ``0`` when the busy-time counter increased across the run, ``2``
        otherwise.

    Raises:
        SystemExit: With a message when the model path or the counter file is
            missing, when the counter cannot be read as an integer, or when
            ``--max-new-tokens`` is outside 1..256. argparse raises it as well
            for malformed options.
    """
    import sys  # local import: only needed for the stderr diagnostic below

    parser = argparse.ArgumentParser(
        description="Smoke-test OpenVINO GenAI LLMPipeline on Intel NPU.",
    )
    parser.add_argument("--model", default=DEFAULT_MODEL, help="model path passed to LLMPipeline")
    parser.add_argument("--cache-dir", default=DEFAULT_CACHE, help="directory passed as CACHE_DIR")
    parser.add_argument("--busy-path", default=str(BUSY_PATH), help="NPU busy-time counter file")
    parser.add_argument(
        "--prompt",
        default="Write a concise title for: Synthetic NPU worker contract smoke with no routing changes.",
        help="prompt text to generate from",
    )
    parser.add_argument("--max-new-tokens", type=int, default=24, help="token limit, 1..256")
    args = parser.parse_args(argv)

    model_path = Path(args.model)
    cache_dir = Path(args.cache_dir)
    busy_path = Path(args.busy_path)

    # Validate everything before touching the filesystem; the checks keep their
    # original order so the first reported problem is unchanged.
    if not model_path.exists():
        raise SystemExit(f"model path does not exist: {model_path}")
    if not busy_path.exists():
        raise SystemExit(f"NPU busy-time counter does not exist: {busy_path}")
    if not 1 <= args.max_new_tokens <= 256:
        raise SystemExit("max-new-tokens must be between 1 and 256")

    # Only create the cache directory once the invocation has been accepted, so
    # a rejected run leaves no stray directory behind.
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Properties forwarded verbatim as keyword arguments to LLMPipeline.
    # NOTE(review): MIN_RESPONSE_LEN is 64 while --max-new-tokens accepts up to
    # 256 -- confirm against the OpenVINO GenAI NPU documentation that longer
    # generations are honored with these settings.
    config = {
        "CACHE_DIR": str(cache_dir),
        "MAX_PROMPT_LEN": 1024,
        "MIN_RESPONSE_LEN": 64,
        "PREFILL_HINT": "DYNAMIC",
        "GENERATE_HINT": "FAST_COMPILE",
    }

    def sample_counter() -> int:
        # The file can exist yet be unreadable or hold non-integer text; turn
        # that into a clean exit message instead of a traceback.
        try:
            return read_busy(busy_path)
        except (OSError, ValueError) as err:
            raise SystemExit(
                f"cannot read NPU busy-time counter {busy_path}: {err}"
            ) from err

    ov_genai = import_openvino_genai()
    before = sample_counter()

    # "load" covers pipeline construction only.
    load_start = time.monotonic()
    pipe = ov_genai.LLMPipeline(str(model_path), "NPU", **config)
    load_ms = round((time.monotonic() - load_start) * 1000, 2)

    # "generate" covers the single generate() call only.
    gen_start = time.monotonic()
    output = pipe.generate(args.prompt, max_new_tokens=args.max_new_tokens)
    gen_ms = round((time.monotonic() - gen_start) * 1000, 2)
    after = sample_counter()

    result = {
        "model": str(model_path),
        "device": "NPU",
        "cache_dir": str(cache_dir),
        "prompt_chars": len(args.prompt),
        "max_new_tokens": args.max_new_tokens,
        "text": str(output).strip(),
        "timing_ms": {"load": load_ms, "generate": gen_ms, "total": round(load_ms + gen_ms, 2)},
        "npu_busy_before_us": before,
        "npu_busy_after_us": after,
        "npu_busy_delta_us": after - before,
    }
    print(json.dumps(result, indent=2))

    if after > before:
        return 0
    # Generation returning is not treated as proof of NPU execution; only a
    # counter increase is. Explain the non-zero exit on stderr so stdout stays
    # pure JSON.
    print(
        "NPU busy-time counter did not advance; NPU execution was not verified",
        file=sys.stderr,
    )
    return 2


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())