# swarm-master/whisper-openvino-npu/server.py

import asyncio
import os
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from typing import Optional

import numpy as np
import openvino as ov
import openvino_genai as ov_genai
import soundfile as sf
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse, PlainTextResponse

# Directory handed to ov_genai.WhisperPipeline; override with the
# WHISPER_MODEL_DIR environment variable.
MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
# OpenVINO device string handed to the pipeline; override with WHISPER_DEVICE.
DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
# sysfs file read by busy_us(). Presumably the accelerator driver's cumulative
# busy-time counter in microseconds (inferred from the file name) -- TODO confirm.
BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")

app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
# Held by the transcription endpoint around pipeline construction + generate().
_lock = threading.Lock()
# Lazily created module-level singletons, populated by get_pipe() / get_core().
_pipe = None
_core = None


def busy_us() -> Optional[int]:
    """Return the integer currently stored in ``BUSY_PATH``, or ``None``.

    This is a best-effort probe: ``None`` is returned when the file cannot be
    read or when its content is not a valid integer.

    Returns:
        The parsed counter value, or ``None`` if it is unavailable.
    """
    try:
        return int(BUSY_PATH.read_text().strip())
    except (OSError, ValueError):
        # OSError: file missing or unreadable (e.g. no such device, permissions).
        # ValueError: content is not an integer; this also covers
        # UnicodeDecodeError, which is a ValueError subclass.
        return None


def get_core():
    """Return the module-wide ``ov.Core``, constructing it on first call.

    The instance is stored in the module global ``_core`` and handed back
    unchanged on every later call.
    """
    global _core
    if _core is not None:
        return _core
    _core = ov.Core()
    return _core


def get_pipe():
    """Return the module-wide ``WhisperPipeline``, constructing it on first call.

    The pipeline is built from ``MODEL_DIR`` for ``DEVICE`` and stored in the
    module global ``_pipe``; later calls return that same object.
    """
    global _pipe
    if _pipe is not None:
        return _pipe
    _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
    return _pipe


def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
    """Transcode *upload_path* with ffmpeg and load it as float32 samples.

    ffmpeg is asked for a single-channel, 16000 Hz WAV written to a scratch
    file, which soundfile then reads back. The scratch file is removed before
    returning, whether or not decoding succeeded.

    Args:
        upload_path: Audio file to decode, in any format ffmpeg accepts.

    Returns:
        ``(samples, sample_rate)`` where ``samples`` is a float32 array and
        ``sample_rate`` is the rate soundfile reports for the scratch WAV.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Reserve a unique .wav path. The handle is closed straight away so that
    # ffmpeg (run with -y) can overwrite the empty placeholder.
    scratch = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    scratch.close()
    wav_path = Path(scratch.name)
    try:
        ffmpeg_cmd = [
            "ffmpeg",
            "-nostdin",
            "-hide_banner",
            "-loglevel",
            "error",
            "-y",
            "-i",
            str(upload_path),
            "-ac",
            "1",
            "-ar",
            "16000",
            "-f",
            "wav",
            str(wav_path),
        ]
        subprocess.run(ffmpeg_cmd, check=True)
        samples, rate = sf.read(wav_path, dtype="float32")
        # Defensive downmix: average the channels if more than one came back.
        mono = samples.mean(axis=1) if samples.ndim > 1 else samples
        return mono, int(rate)
    finally:
        wav_path.unlink(missing_ok=True)


@app.get("/")
def root():
    """Serve a one-line plain-text banner identifying the service."""
    banner = "OpenVINO NPU Whisper server\n"
    return PlainTextResponse(banner)


@app.get("/health")
def health():
    """Report device and model availability as JSON.

    ``ok`` is true exactly when OpenVINO lists an ``"NPU"`` device. Any
    exception raised while collecting the report is turned into an HTTP 500
    response whose body carries ``ok: False`` and the error text.
    """
    try:
        core = get_core()
        devices = core.available_devices
        has_npu = "NPU" in devices
        npu_name = None
        if has_npu:
            # Only query the device name when the NPU is actually present.
            npu_name = core.get_property("NPU", "FULL_DEVICE_NAME")
        report = {
            "ok": has_npu,
            "device": DEVICE,
            "devices": devices,
            "npu": npu_name,
            "model_dir": str(MODEL_DIR),
            "model_exists": MODEL_DIR.exists(),
            "npu_busy_time_us": busy_us(),
        }
    except Exception as e:
        failure = {"ok": False, "error": f"{type(e).__name__}: {e}"}
        return JSONResponse(status_code=500, content=failure)
    return report


def _decode_and_transcribe(upload_path: Path) -> tuple[str, int, Optional[int], Optional[int]]:
    """Run the blocking part of a transcription request.

    Intended to be called from a worker thread so that the ffmpeg subprocess,
    pipeline construction and generation do not stall the event loop.

    Args:
        upload_path: Path of the uploaded audio file on disk.

    Returns:
        ``(text, sample_rate, busy_before, busy_after)``; the two busy values
        are ``busy_us()`` readings taken immediately around inference.
    """
    audio, sr = load_audio(upload_path)
    # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
    # this stack: reusing one pipeline produced unstable language detection on
    # repeated short clips. Recreate per request for correctness; OpenVINO's
    # compiled-cache path keeps warm init reasonably fast.
    with _lock:
        # Sample the counter while holding the lock so the delta is not
        # inflated by another request's inference running in parallel.
        before = busy_us()
        pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
        result = pipe.generate(audio)
        after = busy_us()
    return str(result).strip(), sr, before, after


@app.post("/v1/audio/transcriptions")
async def transcriptions(
    file: UploadFile = File(...),
    model: Optional[str] = Form(default=None),
    language: Optional[str] = Form(default=None),
    response_format: Optional[str] = Form(default="json"),
):
    """Transcribe an uploaded audio file.

    The upload is spooled to a temporary file, then decoded and transcribed in
    a worker thread so other requests (e.g. ``/health``) stay responsive
    during inference. The temporary file is always removed afterwards.

    Args:
        file: The uploaded audio.
        model: Echoed back in the JSON response; does not select a model.
        language: Accepted but currently unused.
        response_format: ``"text"`` returns the bare transcript as plain text;
            any other value returns the JSON object.

    Returns:
        A ``PlainTextResponse`` with the transcript, or a dict with ``text``,
        ``duration_seconds``, ``sample_rate``, ``device``, ``model`` and
        ``npu_busy_delta_us``.
    """
    # NOTE(review): `language` is never forwarded to generate() -- confirm
    # whether it should be before wiring it through.
    suffix = Path(file.filename or "audio").suffix or ".audio"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        upload_path = Path(tmp.name)

    try:
        # Written inside the try so that a failed read/write cannot leak the
        # temporary file.
        upload_path.write_bytes(await file.read())

        t0 = time.perf_counter()
        text, sr, before, after = await asyncio.to_thread(_decode_and_transcribe, upload_path)
        elapsed = time.perf_counter() - t0

        if response_format == "text":
            return PlainTextResponse(text)
        return {
            "text": text,
            "duration_seconds": round(elapsed, 4),
            "sample_rate": sr,
            "device": DEVICE,
            "model": model or MODEL_DIR.name,
            "npu_busy_delta_us": None if before is None or after is None else after - before,
        }
    finally:
        upload_path.unlink(missing_ok=True)