feat(voice): add OpenVINO NPU Whisper service

2026-06-04 13:07:51 -07:00
parent f9ef8b55ac
commit 83d0ced08c
3 changed files with 328 additions and 17 deletions
@@ -0,0 +1,147 @@
+import os
+import subprocess
+import tempfile
+import threading
+import time
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import openvino as ov
+import openvino_genai as ov_genai
+import soundfile as sf
+from fastapi import FastAPI, File, Form, UploadFile
+from fastapi.responses import JSONResponse, PlainTextResponse
+
+# Directory holding the exported Whisper model; override with WHISPER_MODEL_DIR.
+MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
+# Device name handed to the Whisper pipeline; override with WHISPER_DEVICE.
+DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
+# sysfs file read by busy_us(). Presumably a cumulative NPU busy-time counter
+# in microseconds (inferred from the file name only -- TODO confirm).
+BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
+
+app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
+# Held around pipeline construction and generation in the transcription endpoint.
+_lock = threading.Lock()
+# Lazily created singletons, populated by get_pipe() and get_core() respectively.
+_pipe = None
+_core = None
+
+
+def busy_us() -> Optional[int]:
+    """Read the integer counter stored in ``BUSY_PATH``.
+
+    Returns:
+        The parsed integer, or ``None`` when the file cannot be read or its
+        content is not an integer.
+    """
+    try:
+        return int(BUSY_PATH.read_text().strip())
+    except (OSError, ValueError):
+        # Best effort: the counter is optional telemetry, so a missing,
+        # unreadable or malformed file is reported as "unknown" instead of
+        # failing the request. Unexpected error types are no longer hidden.
+        return None
+
+
+def get_core():
+    """Return the shared ``ov.Core``, creating it on first use."""
+    global _core
+    if _core is not None:
+        return _core
+    _core = ov.Core()
+    return _core
+
+
+def get_pipe():
+    """Return the shared ``WhisperPipeline`` for ``MODEL_DIR`` on ``DEVICE``.
+
+    The pipeline is built on the first call and cached in ``_pipe``.
+    """
+    global _pipe
+    if _pipe is not None:
+        return _pipe
+    _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
+    return _pipe
+
+
+def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
+    """Convert an uploaded audio file into mono 16 kHz float32 samples.
+
+    ffmpeg transcodes ``upload_path`` into a temporary WAV file, which is then
+    read back with soundfile. The temporary file is removed on every exit path.
+
+    Returns:
+        ``(samples, sample_rate)``.
+
+    Raises:
+        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
+    """
+    # Only the path is needed here; ffmpeg overwrites the empty file (``-y``).
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as scratch:
+        wav_path = Path(scratch.name)
+    try:
+        ffmpeg_cmd = [
+            "ffmpeg", "-nostdin", "-hide_banner",
+            "-loglevel", "error",
+            "-y",
+            "-i", str(upload_path),
+            "-ac", "1",
+            "-ar", "16000",
+            "-f", "wav",
+            str(wav_path),
+        ]
+        subprocess.run(ffmpeg_cmd, check=True)
+        samples, rate = sf.read(wav_path, dtype="float32")
+        # ffmpeg was asked for one channel; average the channels anyway if a
+        # multi-channel array comes back.
+        mono = samples.mean(axis=1) if samples.ndim > 1 else samples
+        return mono, int(rate)
+    finally:
+        wav_path.unlink(missing_ok=True)
+
+
+@app.get("/")
+def root():
+    """Return a plain-text banner identifying the service."""
+    banner = "OpenVINO NPU Whisper server\n"
+    return PlainTextResponse(banner)
+
+
+@app.get("/health")
+def health():
+    """Report device availability, model location and the NPU busy counter.
+
+    ``ok`` is true when OpenVINO lists an ``NPU`` device. Any exception raised
+    while gathering the report is returned as a 500 JSON response carrying the
+    exception type and message.
+    """
+    try:
+        core = get_core()
+        devices = core.available_devices
+        has_npu = "NPU" in devices
+        if has_npu:
+            npu_name = core.get_property("NPU", "FULL_DEVICE_NAME")
+        else:
+            npu_name = None
+        report = {
+            "ok": has_npu,
+            "device": DEVICE,
+            "devices": devices,
+            "npu": npu_name,
+            "model_dir": str(MODEL_DIR),
+            "model_exists": MODEL_DIR.exists(),
+            "npu_busy_time_us": busy_us(),
+        }
+    except Exception as exc:
+        detail = f"{type(exc).__name__}: {exc}"
+        return JSONResponse(status_code=500, content={"ok": False, "error": detail})
+    return report
+
+
+def _transcribe_file(upload_path: Path) -> tuple[str, int]:
+    """Decode ``upload_path`` and run Whisper on it (blocking).
+
+    Returns:
+        ``(text, sample_rate)`` where ``text`` is the stripped transcription.
+    """
+    audio, sr = load_audio(upload_path)
+    # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
+    # this stack: reusing one pipeline produced unstable language detection on
+    # repeated short clips. Recreate per request for correctness; OpenVINO's
+    # compiled-cache path keeps warm init reasonably fast.
+    with _lock:
+        pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
+        result = pipe.generate(audio)
+    return str(result).strip(), sr
+
+
+@app.post("/v1/audio/transcriptions")
+async def transcriptions(
+    file: UploadFile = File(...),
+    model: Optional[str] = Form(default=None),
+    language: Optional[str] = Form(default=None),
+    response_format: Optional[str] = Form(default="json"),
+):
+    """Transcribe an uploaded audio file.
+
+    Args:
+        file: The uploaded audio; its suffix is kept for the temporary copy.
+        model: Echoed back in the JSON response; defaults to the model
+            directory name. It does not select a model.
+        language: Accepted but currently not forwarded to the pipeline.
+        response_format: ``"text"`` returns the bare transcription as plain
+            text; any other value returns the JSON payload.
+
+    Returns:
+        A ``PlainTextResponse`` or a dict with the text, wall-clock duration,
+        sample rate, device, model name and the NPU busy-counter delta
+        (``None`` when the counter could not be read).
+    """
+    import asyncio  # function-scope so this block is self-contained
+
+    suffix = Path(file.filename or "audio").suffix or ".audio"
+    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+    upload_path = Path(tmp.name)
+    try:
+        # Writing inside the try ensures the temp file is removed even when
+        # reading the upload or writing it to disk fails.
+        with tmp:
+            tmp.write(await file.read())
+
+        before = busy_us()
+        t0 = time.perf_counter()
+        # ffmpeg decoding and NPU inference block for a long time; run them in
+        # a worker thread so the event loop keeps serving other requests.
+        # ``_lock`` inside the helper still serialises access to the device.
+        text, sr = await asyncio.to_thread(_transcribe_file, upload_path)
+        elapsed = time.perf_counter() - t0
+        after = busy_us()
+        if response_format == "text":
+            return PlainTextResponse(text)
+        return {
+            "text": text,
+            "duration_seconds": round(elapsed, 4),
+            "sample_rate": sr,
+            "device": DEVICE,
+            "model": model or MODEL_DIR.name,
+            "npu_busy_delta_us": None if before is None or after is None else after - before,
+        }
+    finally:
+        upload_path.unlink(missing_ok=True)