From 83d0ced08ccccd7a5c739642b241b2e27bf58487 Mon Sep 17 00:00:00 2001 From: William Valentin Date: Thu, 4 Jun 2026 13:07:51 -0700 Subject: [PATCH] feat(voice): add OpenVINO NPU Whisper service --- docker-compose.yaml | 167 ++++++++++++++++++++++++++++---- whisper-openvino-npu/Dockerfile | 31 ++++++ whisper-openvino-npu/server.py | 147 ++++++++++++++++++++++++++++ 3 files changed, 328 insertions(+), 17 deletions(-) create mode 100644 whisper-openvino-npu/Dockerfile create mode 100644 whisper-openvino-npu/server.py diff --git a/docker-compose.yaml b/docker-compose.yaml index 28a7e08..0a92e16 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -30,31 +30,166 @@ services: # start_period: 15s # retries: 3 - # Optional local dependency: whisper.cpp server for audio transcription. - # Start with: docker compose --profile voice up -d whisper-server - whisper-server: - image: ghcr.io/ggml-org/whisper.cpp@sha256:3a39e86d5a0e911086b5cbebc9029cac71b02fbd08e217b775857de1490f55bf - container_name: whisper-server + # One-shot init: download whisper models into the shared volume if missing. + # The base image only ships ggml-base.en.bin; the servers below require: + # - ggml-medium.bin for the CPU server + # - ggml-small.bin for the GPU server (small fits in the limited VRAM left after gemma) + whisper-init: + image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09 + container_name: whisper-init + profiles: ["voice", "voice-cpu-backup"] + restart: "no" + volumes: + - whisper-models:/app/models + entrypoint: ["sh", "-c"] + command: + - | + set -e + for m in medium small base; do + if [ -f /app/models/ggml-$$m.bin ]; then + echo "Model ggml-$$m.bin already present, skipping download." + else + echo "Downloading ggml-$$m.bin..." + sh /app/models/download-ggml-model.sh $$m /app/models + fi + done + + # Manual GPU whisper.cpp fallback: NVIDIA RTX 5070 Ti via CUDA (Blackwell sm_120). 
+ # Kept out of the normal `voice` profile because the OpenVINO NPU Whisper + # service is the default and this container consumes GPU resources. + # + # The official `ghcr.io/ggml-org/whisper.cpp:main-cuda` ships kernels only + # for sm_75/80/86/90 and fails to init CUDA on Blackwell. We build a custom + # image with `CMAKE_CUDA_ARCHITECTURES=120` from the local Dockerfile. + # Build manually with: docker build -t whisper.cpp:cuda-blackwell ./whisper-cuda-blackwell + # Or `docker compose --profile voice-gpu build whisper-server-gpu`. + whisper-server-gpu: + image: whisper.cpp:cuda-blackwell + build: + context: ./whisper-cuda-blackwell + dockerfile: Dockerfile + container_name: whisper-server-gpu restart: unless-stopped - profiles: ["voice"] + profiles: ["voice-gpu"] ports: - "18801:8080" volumes: - whisper-models:/app/models - # Override image entrypoint so args are passed directly to whisper-server. entrypoint: ["whisper-server"] command: - --model - - /app/models/ggml-base.en.bin + - /app/models/ggml-base.bin - --host - 0.0.0.0 - --port - "8080" - --convert - --language - - en + - auto - --inference-path - /v1/audio/transcriptions + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + depends_on: + whisper-init: + condition: service_completed_successfully + healthcheck: + test: + [ + "CMD-SHELL", + "curl -f http://localhost:8080/ >/dev/null 2>&1 || exit 1", + ] + interval: 30s + timeout: 5s + start_period: 30s + retries: 3 + labels: + agentmon.monitor: "true" + agentmon.role: "voice" + agentmon.port: "18801" + + # Experimental OpenVINO GenAI Whisper server using the Intel NPU. + # This is not whisper.cpp; it implements the same OpenAI-style + # /v1/audio/transcriptions route using OpenVINO WhisperPipeline on NPU. + # Host requirements: intel-npu-driver-bin installed, /dev/accel/accel0 present, + # and the host NPU Level Zero driver/compiler libraries mounted below. 
+ whisper-server-npu: + image: whisper-openvino-npu:local + build: + context: ./whisper-openvino-npu + dockerfile: Dockerfile + container_name: whisper-server-npu + restart: unless-stopped + profiles: ["voice"] + ports: + - "18816:8080" + devices: + - /dev/accel/accel0:/dev/accel/accel0 + group_add: + - "987" # host render group gid on willlaptop + environment: + - WHISPER_DEVICE=NPU + - WHISPER_MODEL_DIR=/models/whisper-tiny-fp16-ov + - LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu + - ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1 + volumes: + - /home/will/.cache/openvino-models/whisper-tiny-fp16-ov:/models/whisper-tiny-fp16-ov:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1.32.1:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1:ro + - /usr/lib/x86_64-linux-gnu/libze_intel_npu.so:/usr/lib/x86_64-linux-gnu/libze_intel_npu.so:ro + - /usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:/usr/lib/x86_64-linux-gnu/libnpu_driver_compiler.so:ro + healthcheck: + test: + [ + "CMD-SHELL", + "curl -f http://localhost:8080/health >/dev/null 2>&1 || exit 1", + ] + interval: 30s + timeout: 5s + start_period: 30s + retries: 3 + labels: + agentmon.monitor: "true" + agentmon.role: "voice" + agentmon.port: "18816" + + # Manual fallback whisper.cpp server: CPU-only, medium model. + # Kept around for resilience — runs if the NPU/GPU servers are down. Uses no + # accelerator resources, but is slow (~14 s per short clip). + # Disabled from the normal `voice` profile now that `whisper-server-npu` is + # the trial default. 
Start manually with: + # docker compose --profile voice-cpu-backup up -d whisper-server + whisper-server: + image: ghcr.io/ggml-org/whisper.cpp@sha256:672650b5e67f9cb86af7ac6e09dea8eac12a024086e1e5c0172fdccf336aba09 + container_name: whisper-server + restart: unless-stopped + profiles: ["voice-cpu-backup"] + ports: + - "18811:8080" + volumes: + - whisper-models:/app/models + # Override image entrypoint so args are passed directly to whisper-server. + entrypoint: ["whisper-server"] + command: + - --model + - /app/models/ggml-medium.bin + - --host + - 0.0.0.0 + - --port + - "8080" + - --convert + - --language + - auto + - --inference-path + - /v1/audio/transcriptions + depends_on: + whisper-init: + condition: service_completed_successfully healthcheck: test: [ @@ -68,7 +203,7 @@ services: labels: agentmon.monitor: "true" agentmon.role: "voice" - agentmon.port: "18801" + agentmon.port: "18811" # kokoro TTS kokoro-tts: @@ -134,7 +269,7 @@ services: # Optional local dependency: liteLLM proxy for unified LLM API. 
# Start with: docker compose --profile api up -d litellm litellm: - image: litellm/litellm:v1.82.3-stable.patch.2 + image: litellm/litellm:v1.83.7-stable container_name: litellm restart: unless-stopped profiles: ["api"] @@ -142,7 +277,6 @@ services: - "18804:4000" volumes: - ./litellm-config.yaml:/app/config.yaml:ro - - ./litellm-copilot-tokens:/root/.config/litellm/github_copilot environment: - LITELLM_PORT=4000 - LITELLM_DROP_PARAMS=true @@ -151,7 +285,6 @@ services: - OPENROUTER_API_KEY=${OPENROUTER_API_KEY:-} - GEMINI_API_KEY=${GEMINI_API_KEY:-} - ZAI_API_KEY=${ZAI_API_KEY:-} - - GITHUB_COPILOT_TOKEN_DIR=/root/.config/litellm/github_copilot - DATABASE_URL=postgresql://litellm:litellm_password@litellm-db:5432/litellm - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY:-sk-1234} - LITELLM_SALT_KEY=${LITELLM_SALT_KEY:-} @@ -198,7 +331,7 @@ services: condition: service_healthy litellm-db: - image: postgres:15-alpine + image: postgres:15.17-alpine container_name: litellm-db restart: unless-stopped profiles: ["api"] @@ -221,7 +354,7 @@ services: # Dedicated local n8n instance for agent-oriented workflows. 
# Start with: docker compose --profile automation up -d n8n-agent n8n-agent: - image: docker.n8n.io/n8nio/n8n:2.11.3 + image: docker.n8n.io/n8nio/n8n:2.22.1 container_name: n8n-agent restart: unless-stopped profiles: ["automation"] @@ -233,8 +366,8 @@ services: - N8N_PROTOCOL=http - N8N_EDITOR_BASE_URL=http://localhost:18808 - WEBHOOK_URL=http://localhost:18808/ - - TZ=UTC - - GENERIC_TIMEZONE=UTC + - TZ=America/Los_Angeles + - GENERIC_TIMEZONE=America/Los_Angeles - N8N_SECURE_COOKIE=false volumes: - n8n-agent-data:/home/node/.n8n diff --git a/whisper-openvino-npu/Dockerfile b/whisper-openvino-npu/Dockerfile new file mode 100644 index 0000000..24759ae --- /dev/null +++ b/whisper-openvino-npu/Dockerfile @@ -0,0 +1,31 @@ +FROM python:3.14-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu \ + ZE_ENABLE_ALT_DRIVERS=/usr/lib/x86_64-linux-gnu/libze_intel_npu.so.1 + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ffmpeg libze1 ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --upgrade pip \ + && python -m pip install \ + fastapi==0.126.0 \ + uvicorn[standard]==0.38.0 \ + python-multipart==0.0.22 \ + openvino==2026.2.0 \ + openvino-genai==2026.2.0.0 \ + soundfile==0.13.1 \ + numpy==2.4.6 + +WORKDIR /app +COPY server.py /app/server.py + +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD curl -fsS http://localhost:8080/health >/dev/null || exit 1 + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/whisper-openvino-npu/server.py b/whisper-openvino-npu/server.py new file mode 100644 index 0000000..f7af663 --- /dev/null +++ b/whisper-openvino-npu/server.py @@ -0,0 +1,147 @@ +import os +import subprocess +import tempfile +import threading +import time +from pathlib import Path +from typing import Optional + +import numpy as np +import 
openvino as ov
+import openvino_genai as ov_genai
+import soundfile as sf
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.concurrency import run_in_threadpool
+from fastapi.responses import JSONResponse, PlainTextResponse
+
+MODEL_DIR = Path(os.environ.get("WHISPER_MODEL_DIR", "/models/whisper-tiny-fp16-ov"))
+DEVICE = os.environ.get("WHISPER_DEVICE", "NPU")
+BUSY_PATH = Path("/sys/class/accel/accel0/device/npu_busy_time_us")
+# OpenVINO meta-devices; they are not expected to show up in Core.available_devices.
+_VIRTUAL_DEVICES = frozenset({"AUTO", "MULTI", "HETERO", "BATCH"})
+
+app = FastAPI(title="OpenVINO NPU Whisper server", version="0.1.0")
+_lock = threading.Lock()
+_pipe = None
+_core = None
+
+
+def busy_us() -> Optional[int]:
+    """Return the integer stored in BUSY_PATH, or None if it cannot be read or parsed."""
+    try:
+        return int(BUSY_PATH.read_text().strip())
+    except (OSError, ValueError):
+        # Best effort: the sysfs counter is optional telemetry, never a hard failure.
+        return None
+
+
+def get_core():
+    """Return the process-wide ``ov.Core``, creating it on first use."""
+    global _core
+    if _core is None:
+        _core = ov.Core()
+    return _core
+
+
+def get_pipe():
+    """Return a cached ``WhisperPipeline`` for MODEL_DIR on DEVICE, creating it on first use.
+
+    Not used by the transcription endpoint, which builds a fresh pipeline per
+    request (see ``_generate``).
+    """
+    global _pipe
+    if _pipe is None:
+        _pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
+    return _pipe
+
+
+def load_audio(upload_path: Path) -> tuple[np.ndarray, int]:
+    """Decode arbitrary uploaded audio to mono 16 kHz float32 using ffmpeg + soundfile.
+
+    Returns ``(samples, sample_rate)``. Raises ``subprocess.CalledProcessError``
+    when ffmpeg exits non-zero (for example on input it cannot decode). The
+    intermediate WAV file is always removed.
+    """
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
+        wav_path = Path(wav.name)
+    try:
+        subprocess.run(
+            [
+                "ffmpeg",
+                "-nostdin",
+                "-hide_banner",
+                "-loglevel",
+                "error",
+                "-y",
+                "-i",
+                str(upload_path),
+                "-ac",
+                "1",
+                "-ar",
+                "16000",
+                "-f",
+                "wav",
+                str(wav_path),
+            ],
+            check=True,
+        )
+        audio, sr = sf.read(wav_path, dtype="float32")
+        if audio.ndim > 1:
+            # Defensive down-mix in case the decoded file is still multi-channel.
+            audio = audio.mean(axis=1)
+        return audio, int(sr)
+    finally:
+        wav_path.unlink(missing_ok=True)
+
+
+def _device_available(devices) -> bool:
+    """Return True when the configured DEVICE is backed by an entry in *devices*.
+
+    Multiple devices of one kind are listed with a numeric suffix ("GPU.0",
+    "GPU.1"), so both sides are compared on the name before the first ".".
+    Meta-devices such as "AUTO:NPU,CPU" count as available when any real
+    device is present.
+    """
+    wanted = DEVICE.split(":", 1)[0].split(".", 1)[0]
+    if wanted in _VIRTUAL_DEVICES:
+        return bool(devices)
+    return any(name.split(".", 1)[0] == wanted for name in devices)
+
+
+def _generate(audio: np.ndarray) -> str:
+    """Run Whisper on *audio* and return the stripped transcript.
+
+    Blocking: waits on ``_lock`` and runs inference, so call it from a worker
+    thread rather than from the event loop.
+    """
+    # OpenVINO GenAI WhisperPipeline appears stateful for Whisper generation on
+    # this stack: reusing one pipeline produced unstable language detection on
+    # repeated short clips. Recreate per request for correctness; OpenVINO's
+    # compiled-cache path keeps warm init reasonably fast.
+    with _lock:
+        pipe = ov_genai.WhisperPipeline(str(MODEL_DIR), DEVICE)
+        result = pipe.generate(audio)
+        return str(result).strip()
+
+
+@app.get("/")
+def root():
+    """Return a plain-text banner identifying the service."""
+    return PlainTextResponse("OpenVINO NPU Whisper server\n")
+
+
+@app.get("/health")
+def health():
+    """Report device and model status.
+
+    Responds 200 only when the configured DEVICE is available and MODEL_DIR
+    exists, and 503 otherwise, so that ``curl -f`` based container healthchecks
+    actually fail when the service cannot transcribe. Unexpected errors while
+    querying OpenVINO yield 500 with the exception text.
+    """
+    try:
+        core = get_core()
+        devices = core.available_devices
+        npu_name = core.get_property("NPU", "FULL_DEVICE_NAME") if "NPU" in devices else None
+        model_exists = MODEL_DIR.exists()
+        ok = _device_available(devices) and model_exists
+        return JSONResponse(
+            status_code=200 if ok else 503,
+            content={
+                "ok": ok,
+                "device": DEVICE,
+                "devices": devices,
+                "npu": npu_name,
+                "model_dir": str(MODEL_DIR),
+                "model_exists": model_exists,
+                "npu_busy_time_us": busy_us(),
+            },
+        )
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"ok": False, "error": f"{type(e).__name__}: {e}"})
+
+
+@app.post("/v1/audio/transcriptions")
+async def transcriptions(
+    file: UploadFile = File(...),
+    model: Optional[str] = Form(default=None),
+    language: Optional[str] = Form(default=None),
+    response_format: Optional[str] = Form(default="json"),
+):
+    """Transcribe an uploaded audio file (OpenAI-style route).
+
+    Returns plain text when ``response_format == "text"``, otherwise a JSON
+    object with the transcript and timing/telemetry fields. Responds 400 when
+    the upload cannot be decoded or contains no samples.
+
+    NOTE(review): ``language`` is accepted for API compatibility but is not
+    forwarded to the pipeline; ``model`` is only echoed back in the response.
+    """
+    suffix = Path(file.filename or "audio").suffix or ".audio"
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        upload_path = Path(tmp.name)
+
+    try:
+        # Written inside the try so a failed write cannot leave the temp file behind.
+        upload_path.write_bytes(await file.read())
+
+        before = busy_us()
+        t0 = time.perf_counter()
+        # ffmpeg and inference are blocking; run them in the thread pool so the
+        # event loop keeps serving /health and other requests meanwhile.
+        try:
+            audio, sr = await run_in_threadpool(load_audio, upload_path)
+        except subprocess.CalledProcessError as exc:
+            raise HTTPException(status_code=400, detail="could not decode audio with ffmpeg") from exc
+        if audio.size == 0:
+            raise HTTPException(status_code=400, detail="decoded audio contains no samples")
+        text = await run_in_threadpool(_generate, audio)
+        elapsed = time.perf_counter() - t0
+        after = busy_us()
+        if response_format == "text":
+            return PlainTextResponse(text)
+        return {
+            "text": text,
+            "duration_seconds": round(elapsed, 4),
+            "sample_rate": sr,
+            "device": DEVICE,
+            "model": model or MODEL_DIR.name,
+            "npu_busy_delta_us": None if before is None or after is None else after - before,
+        }
+    finally:
+        upload_path.unlink(missing_ok=True)