"""OpenVINO Whisper transcription server.

Exposes a small FastAPI application with a health endpoint and a
``/v1/audio/transcriptions`` endpoint that decodes an uploaded audio file with
ffmpeg and transcribes it with an OpenVINO GenAI ``WhisperPipeline``.
"""

import asyncio
import os
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from typing import Optional

import numpy as np
import openvino as ov
import openvino_genai as ov_genai
import soundfile as sf
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse, PlainTextResponse

# Model directory and target device are configurable through the environment.
MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
# sysfs counter read before/after a request to report a busy-time delta.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")

# Uploads are copied to disk in chunks of this size instead of being read
# into memory in one piece.
_UPLOAD_CHUNK_BYTES = 1 << 20

app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")

# Serializes pipeline construction and inference across requests.
_lock = threading.Lock()
_pipe = None
_core = None


def busy_us() -> Optional[int]:
    """Return the integer stored in ``BUSY_PATH``, or ``None`` if unavailable.

    ``None`` is returned when the file cannot be read (missing, permission
    denied, ...) or its content is not an integer.
    """
    try:
        return int(BUSY_PATH.read_text().strip())
    except (OSError, ValueError):
        return None


def get_core():
    """Return the process-wide ``ov.Core``, creating it on first use."""
    global _core
    if _core is None:
        _core = ov.Core()
    return _core


def get_pipe():
    """Return a cached ``WhisperPipeline`` for ``MODEL_DIR`` on ``DEVICE``.

    The pipeline is created on first use. The transcription endpoint in this
    file does not use this cache; it builds a fresh pipeline per request.
    """
    global _pipe
    if _pipe is None:
        _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
    return _pipe


def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
    """Decode arbitrary uploaded audio to mono 16 kHz float32 using ffmpeg + soundfile.

    Args:
        upload_path: Path of the audio file to decode.

    Returns:
        A ``(samples, sample_rate)`` pair.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Reserve a path for ffmpeg's output; the handle is closed immediately so
    # ffmpeg can overwrite the file (hence "-y").
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = Path(wav.name)
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-nostdin",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",
                "-i",
                str(upload_path),
                "-ac",
                "1",
                "-ar",
                "16000",
                "-f",
                "wav",
                str(wav_path),
            ],
            check=True,
        )
        audio, sr = sf.read(wav_path, dtype="float32")
        # ffmpeg was asked for one channel; down-mix defensively if the
        # decoded array is still multi-dimensional.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        return audio, int(sr)
    finally:
        wav_path.unlink(missing_ok=True)


def _transcribe_file(upload_path: Path) -> tuple[str, int]:
    """Decode ``upload_path`` and transcribe it; blocking, meant for a worker thread.

    Returns:
        A ``(text, sample_rate)`` pair.

    Raises:
        subprocess.CalledProcessError: If ffmpeg cannot decode the file.
    """
    audio, sr = load_audio(upload_path)
    # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
    # this stack: reusing one pipeline produced unstable language detection on
    # repeated short clips. Recreate per request for correctness; OpenVINO's
    # compiled-cache path keeps warm init reasonably fast.
    with _lock:
        pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
        result = pipe.generate(audio)
    return str(result).strip(), sr


@app.get("/")
def root():
    """Return a plain-text banner identifying the server."""
    return PlainTextResponse("OpenVINO NPU Whisper server\n")


@app.get("/health")
def health():
    """Report device availability, model location and the busy-time counter.

    Returns a JSON object on success, or a 500 JSON response carrying the
    exception type and message if querying OpenVINO fails.
    """
    try:
        core = get_core()
        devices = core.available_devices
        npu_name = core.get_property("NPU", "FULL_DEVICE_NAME") if "NPU" in devices else None
        return {
            "ok": "NPU" in devices,
            "device": DEVICE,
            "devices": devices,
            "npu": npu_name,
            "model_dir": str(MODEL_DIR),
            "model_exists": MODEL_DIR.exists(),
            "npu_busy_time_us": busy_us(),
        }
    except Exception as e:
        # Top-level boundary: surface any failure to the client as JSON.
        return JSONResponse(
            status_code=500,
            content={"ok": False, "error": f"{type(e).__name__}: {e}"},
        )


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
):
    """Transcribe an uploaded audio file.

    Args:
        file: The uploaded audio; any format ffmpeg can decode.
        model: Echoed back in the JSON response; falls back to the model
            directory name. It does not select which model is loaded.
        language: Accepted but currently not forwarded to the pipeline.
        response_format: ``"text"`` returns the transcript as plain text; any
            other value returns a JSON object.

    Returns:
        The transcript as plain text or JSON, or a 400 JSON response if
        ffmpeg cannot decode the upload.
    """
    suffix = Path(file.filename or "audio").suffix or ".audio"
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    upload_path = Path(tmp.name)
    try:
        # Stream the upload to disk in chunks; the surrounding try/finally
        # removes the temp file even if the copy itself fails.
        with tmp:
            while chunk := await file.read(_UPLOAD_CHUNK_BYTES):
                tmp.write(chunk)

        before = busy_us()
        t0 = time.perf_counter()
        try:
            # ffmpeg, the lock and inference all block; run them in a worker
            # thread so the event loop keeps serving other requests.
            text, sr = await asyncio.to_thread(_transcribe_file, upload_path)
        except subprocess.CalledProcessError as exc:
            return JSONResponse(
                status_code=400,
                content={
                    "error": f"could not decode audio: ffmpeg exited with status {exc.returncode}"
                },
            )
        elapsed = time.perf_counter() - t0
        after = busy_us()

        if response_format == "text":
            return PlainTextResponse(text)
        return {
            "text": text,
            "duration_seconds": round(elapsed, 4),
            "sample_rate": sr,
            "device": DEVICE,
            "model": model or MODEL_DIR.name,
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }
    finally:
        upload_path.unlink(missing_ok=True)